rm(list = ls())
library(ggplot2)
library(GGally)## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(factoextra)## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(plotly)##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(cluster)
library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.6 v purrr 0.3.4
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
library(MASS)##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
library(caret) # advance package with more than 200 prediction models. ## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
library(naivebayes)## naivebayes 0.9.7 loaded
library(elasticnet)## Loading required package: lars
## Loaded lars 1.2
library(parallel)
library(tictoc)
library(doParallel)## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
library(glmnet)## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-3
library(leaflet)
library(kknn)##
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
##
## contr.dummy
library(leaps)
library(olsrr)##
## Attaching package: 'olsrr'
## The following object is masked from 'package:MASS':
##
## cement
## The following object is masked from 'package:datasets':
##
## rivers
library(skimr)
library(forcats)
library(VIM)## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(randomForest)## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)## Loaded gbm 2.1.8
library(neuralnet)##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
library(xgboost)##
## Attaching package: 'xgboost'
## The following object is masked from 'package:plotly':
##
## slice
## The following object is masked from 'package:dplyr':
##
## slice
library(pROC)## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(rpart)
library(rpart.plot)
setwd("")
data = read.csv(file = "OnlineNewsPopularity.csv")*I am going to use a useful tool from previous year, parallel programming, so that we can compute faster:
# Function to start the parallel programming
startParallel = function(model = "The model") {
print(paste(model, "has started processing. Wait until it finishes."))
cores = detectCores() - 1
cl = makeCluster(cores)
registerDoParallel(cl)
return(cl)
}
# Function to stop the parallel programming
endParallel = function(cl, model = "The model") {
stopCluster(cl)
print(paste(model, "has finished."))
}summary(data)## url timedelta n_tokens_title n_tokens_content
## Length:39644 Min. : 8.0 Min. : 2.0 Min. : 0.0
## Class :character 1st Qu.:164.0 1st Qu.: 9.0 1st Qu.: 246.0
## Mode :character Median :339.0 Median :10.0 Median : 409.0
## Mean :354.5 Mean :10.4 Mean : 546.5
## 3rd Qu.:542.0 3rd Qu.:12.0 3rd Qu.: 716.0
## Max. :731.0 Max. :23.0 Max. :8474.0
## n_unique_tokens n_non_stop_words n_non_stop_unique_tokens
## Min. : 0.0000 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.4709 1st Qu.: 1.0000 1st Qu.: 0.6257
## Median : 0.5392 Median : 1.0000 Median : 0.6905
## Mean : 0.5482 Mean : 0.9965 Mean : 0.6892
## 3rd Qu.: 0.6087 3rd Qu.: 1.0000 3rd Qu.: 0.7546
## Max. :701.0000 Max. :1042.0000 Max. :650.0000
## num_hrefs num_self_hrefs num_imgs num_videos
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 4.00 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 0.00
## Median : 8.00 Median : 3.000 Median : 1.000 Median : 0.00
## Mean : 10.88 Mean : 3.294 Mean : 4.544 Mean : 1.25
## 3rd Qu.: 14.00 3rd Qu.: 4.000 3rd Qu.: 4.000 3rd Qu.: 1.00
## Max. :304.00 Max. :116.000 Max. :128.000 Max. :91.00
## average_token_length num_keywords data_channel_is_lifestyle
## Min. :0.000 Min. : 1.000 Min. :0.00000
## 1st Qu.:4.478 1st Qu.: 6.000 1st Qu.:0.00000
## Median :4.664 Median : 7.000 Median :0.00000
## Mean :4.548 Mean : 7.224 Mean :0.05295
## 3rd Qu.:4.855 3rd Qu.: 9.000 3rd Qu.:0.00000
## Max. :8.042 Max. :10.000 Max. :1.00000
## data_channel_is_entertainment data_channel_is_bus data_channel_is_socmed
## Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.178 Mean :0.1579 Mean :0.0586
## 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.000 Max. :1.0000 Max. :1.0000
## data_channel_is_tech data_channel_is_world kw_min_min kw_max_min
## Min. :0.0000 Min. :0.0000 Min. : -1.00 Min. : 0
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: -1.00 1st Qu.: 445
## Median :0.0000 Median :0.0000 Median : -1.00 Median : 660
## Mean :0.1853 Mean :0.2126 Mean : 26.11 Mean : 1154
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.: 4.00 3rd Qu.: 1000
## Max. :1.0000 Max. :1.0000 Max. :377.00 Max. :298400
## kw_avg_min kw_min_max kw_max_max kw_avg_max
## Min. : -1.0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 141.8 1st Qu.: 0 1st Qu.:843300 1st Qu.:172847
## Median : 235.5 Median : 1400 Median :843300 Median :244572
## Mean : 312.4 Mean : 13612 Mean :752324 Mean :259282
## 3rd Qu.: 357.0 3rd Qu.: 7900 3rd Qu.:843300 3rd Qu.:330980
## Max. :42827.9 Max. :843300 Max. :843300 Max. :843300
## kw_min_avg kw_max_avg kw_avg_avg self_reference_min_shares
## Min. : -1 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 3562 1st Qu.: 2382 1st Qu.: 639
## Median :1024 Median : 4356 Median : 2870 Median : 1200
## Mean :1117 Mean : 5657 Mean : 3136 Mean : 3999
## 3rd Qu.:2057 3rd Qu.: 6020 3rd Qu.: 3600 3rd Qu.: 2600
## Max. :3613 Max. :298400 Max. :43568 Max. :843300
## self_reference_max_shares self_reference_avg_sharess weekday_is_monday
## Min. : 0 Min. : 0.0 Min. :0.000
## 1st Qu.: 1100 1st Qu.: 981.2 1st Qu.:0.000
## Median : 2800 Median : 2200.0 Median :0.000
## Mean : 10329 Mean : 6401.7 Mean :0.168
## 3rd Qu.: 8000 3rd Qu.: 5200.0 3rd Qu.:0.000
## Max. :843300 Max. :843300.0 Max. :1.000
## weekday_is_tuesday weekday_is_wednesday weekday_is_thursday weekday_is_friday
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1864 Mean :0.1875 Mean :0.1833 Mean :0.1438
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## weekday_is_saturday weekday_is_sunday is_weekend LDA_00
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.02505
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.03339
## Mean :0.06188 Mean :0.06904 Mean :0.1309 Mean :0.18460
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.24096
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :0.92699
## LDA_01 LDA_02 LDA_03 LDA_04
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.02501 1st Qu.:0.02857 1st Qu.:0.02857 1st Qu.:0.02857
## Median :0.03334 Median :0.04000 Median :0.04000 Median :0.04073
## Mean :0.14126 Mean :0.21632 Mean :0.22377 Mean :0.23403
## 3rd Qu.:0.15083 3rd Qu.:0.33422 3rd Qu.:0.37576 3rd Qu.:0.39999
## Max. :0.92595 Max. :0.92000 Max. :0.92653 Max. :0.92719
## global_subjectivity global_sentiment_polarity global_rate_positive_words
## Min. :0.0000 Min. :-0.39375 Min. :0.00000
## 1st Qu.:0.3962 1st Qu.: 0.05776 1st Qu.:0.02838
## Median :0.4535 Median : 0.11912 Median :0.03902
## Mean :0.4434 Mean : 0.11931 Mean :0.03962
## 3rd Qu.:0.5083 3rd Qu.: 0.17783 3rd Qu.:0.05028
## Max. :1.0000 Max. : 0.72784 Max. :0.15549
## global_rate_negative_words rate_positive_words rate_negative_words
## Min. :0.000000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.009615 1st Qu.:0.6000 1st Qu.:0.1852
## Median :0.015337 Median :0.7105 Median :0.2800
## Mean :0.016612 Mean :0.6822 Mean :0.2879
## 3rd Qu.:0.021739 3rd Qu.:0.8000 3rd Qu.:0.3846
## Max. :0.184932 Max. :1.0000 Max. :1.0000
## avg_positive_polarity min_positive_polarity max_positive_polarity
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3062 1st Qu.:0.05000 1st Qu.:0.6000
## Median :0.3588 Median :0.10000 Median :0.8000
## Mean :0.3538 Mean :0.09545 Mean :0.7567
## 3rd Qu.:0.4114 3rd Qu.:0.10000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
## avg_negative_polarity min_negative_polarity max_negative_polarity
## Min. :-1.0000 Min. :-1.0000 Min. :-1.0000
## 1st Qu.:-0.3284 1st Qu.:-0.7000 1st Qu.:-0.1250
## Median :-0.2533 Median :-0.5000 Median :-0.1000
## Mean :-0.2595 Mean :-0.5219 Mean :-0.1075
## 3rd Qu.:-0.1869 3rd Qu.:-0.3000 3rd Qu.:-0.0500
## Max. : 0.0000 Max. : 0.0000 Max. : 0.0000
## title_subjectivity title_sentiment_polarity abs_title_subjectivity
## Min. :0.0000 Min. :-1.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 0.00000 1st Qu.:0.1667
## Median :0.1500 Median : 0.00000 Median :0.5000
## Mean :0.2824 Mean : 0.07143 Mean :0.3418
## 3rd Qu.:0.5000 3rd Qu.: 0.15000 3rd Qu.:0.5000
## Max. :1.0000 Max. : 1.00000 Max. :0.5000
## abs_title_sentiment_polarity shares
## Min. :0.0000 Min. : 1
## 1st Qu.:0.0000 1st Qu.: 946
## Median :0.0000 Median : 1400
## Mean :0.1561 Mean : 3395
## 3rd Qu.:0.2500 3rd Qu.: 2800
## Max. :1.0000 Max. :843300
https://code.datasciencedojo.com/datasciencedojo/datasets/tree/master/Online%20News%20Popularity
K. Fernandes, P. Vinagre and P. Cortez. A Proactive Intelligent Decision Support System for Predicting the Popularity of Online News. Proceedings of the 17th EPIA 2015 - Portuguese Conference on Artificial Intelligence, September, Coimbra, Portugal
This data set summarizes many features about published articles during two years. The target is to predict the number of shares of an article in social media.
We are gonna be capable of predicting by using classification and advanced regression tools the number of shares and to do so, we will use categorical and numerical variables. The implementation of 2 testing sets, one for training and testing and other one to obtain the truly final performance of the tools presented.
-URL represents the direction of the article.
-Timedelta the days passed between publication and data collection.
-N_Tokens_Title is the number of words in title.
-N_Unique_Tokens is the number of unique words.
-Num_Hrefs for the number of links.
-Data_Channel_Is_XXXXX for the type of content that is published.
-Kw referes to the worst and best keywords depending on the number of shares.
-Self_Reference_ are the shares of Referenced articles, maximum, minimum and the average.
-Weekday refers to what day of the week is the article published on.
-Is_Weekend whether the article was published or not on weekend.
-Lda remarks the closeness to a LDA Topic.
-Global_Subjectivity, Global_Sentiment_Polarity, Global_Rate, represents respectively the world rate of words in content, such as subjectivity, sentiment or possitive and negative ratings.
-Rate_Positive_Words / Rate_Negative_Words represents the amount of negative and possitive rating amont tokens that are non-neutral.
-Polarity variables are related to Positive and Negative statistics of the polarity.
-Title variables define how subjective or polar a title is, and Abs_Title defines it in absolute value, without positivity or negativity.
-Shares, the last variable is the most important one because is the one we are going to predict and it is the number of times a source is shared on the media.
First of all, let´s find possible outliers.
hist(rowMeans(is.na(data)), main = "Histogram of the completed data", xlab = "Proportion of missing values", ylab = "Number of rows")hist(colMeans(is.na(data)), main = "Histogram of the completed data", xlab = "Proportion of missing values", ylab = "N-Columns")#No missing values found at all, it seems the data is pretty accurate. In order to remove noise, we are going to mix some columns into one, to reduce computing time and noise.
Let´s group all Weeks columns into just one:
data_original = data
nrows = nrow(data)
data$weekday = NA
for(i in 1:nrows){
if(data$weekday_is_monday[i] == 1){
data$weekday[i] = "Monday"
}
else if(data$weekday_is_tuesday[i] == 1){
data$weekday[i] = "Tuesday"
}
else if(data$weekday_is_wednesday[i] == 1){
data$weekday[i] = "Wednesday"
}
else if(data$weekday_is_thursday[i] == 1){
data$weekday[i] = "Thursday"
}
else if(data$weekday_is_friday[i] == 1){
data$weekday[i] = "Friday"
}
else if(data$weekday_is_saturday[i] == 1){
data$weekday[i] = "Saturday"
}
else if(data$weekday_is_sunday[i] == 1){
data$weekday[i] = "Sunday"
}
}Now let´s apply the same process but for data channel types:
data$channel = NA
for(i in 1:nrows){
if(data$data_channel_is_bus[i] == 1){
data$channel[i] = "Business"
}
else if(data$data_channel_is_entertainment[i] == 1){
data$channel[i] = "Entertainment"
}
else if(data$data_channel_is_lifestyle[i] == 1){
data$channel[i] = "Lifestyle"
}
else if(data$data_channel_is_socmed[i] == 1){
data$channel[i] = "Social-Media"
}
else if(data$data_channel_is_tech[i] == 1){
data$channel[i] = "Tech"
}
else if(data$data_channel_is_world[i] == 1){
data$channel[i] = "World"
}
else{ data$channel[i] ="Other"}
}Now I am going to get rid of the columns that are no longer useful as we have already collected all its information. Also I will remove a few columns that I consider that are not important for our purposes:
removeCols = function(colsVector, dataset){
ix <- which(colnames(dataset) %in% colsVector) #I obtain the number of a column name
return(dataset[, -ix]) # I am selecting the names of the variables i want to eliminate.
}
colsVector = c("weekday_is_monday", "weekday_is_tuesday", "weekday_is_wednesday", "weekday_is_thursday", "weekday_is_friday", "weekday_is_saturday", "weekday_is_sunday", "data_channel_is_world", "data_channel_is_tech", "data_channel_is_socmed", "data_channel_is_lifestyle", "data_channel_is_entertainment", "data_channel_is_bus", "url")
#Vector with the column names i decide to eliminate.
data = removeCols(colsVector, data) #I am using the function removeCols to delete them from the data.
ggcorr(data, label=F)## Warning in ggcorr(data, label = F): data in column(s) 'weekday', 'channel' are
## not numeric and were ignored
We can see that this is a really good data set as no missing values have been found and most of the data is quite relevant. Thanks to the ggcorr plot we can see also that the presented variables have no collinearity that for some supervised learning tools may be a good result. So, seeing the corrplot, there is no need to remove any quite correlated variables.
First, we need to create a categorical variable that we can predict, in order to do so, I have decided to create 3 categories: FewLikes, MidLikes and LotLikes, that will separate the data set in 3 parts, and the size of each part depend on the threshold I select. Let´s do it:
data_csf = data #I duplicate the data set and apply changes to it.
data_csf$shares = factor(ifelse(data_csf$shares <= 1000, "FewLikes", ifelse(data_csf$shares <= 2100, "MidLikes", "LotLikes")))
data_csf$shares <- factor(data_csf$shares, levels = c("FewLikes", "MidLikes", "LotLikes"))
levels(data_csf$shares)## [1] "FewLikes" "MidLikes" "LotLikes"
summary(data$shares)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 946 1400 3395 2800 843300
summary(data_csf$shares)## FewLikes MidLikes LotLikes
## 12424 14265 12955
a = length(which(data$shares <= 1000)); b = length(which(data$shares <= 2100)) ; c = length(which(data$shares > 2500))
a; b-a; c; sum(b + c)## [1] 12424
## [1] 14265
## [1] 10866
## [1] 37555
#This piece of code is just for checking that the partition has been done correctly.
#data_csf$shares = as.numeric(data_csf$shares)Here we have created a classification data set, grouping in the 3 categories using factor function, so that the number of shares is spread.
splitting_csf = createDataPartition(data_csf$shares, p = 0.8, list = FALSE)
dataTrain_csf = data_csf[splitting_csf,]
dataTest_csf = data_csf[-splitting_csf,]
y.train = data_csf[splitting_csf,]$shares
y.test = data_csf[-splitting_csf, ]$sharesWe have split the data into 3 parts. One for the Final testing as we have a lot of samples (20%). The other part (80%) has been divided into 2 different parts: Training and Testing with proportion of 70% and 30% respectively. Moreover I have split the data classification set to be able to compute classification apart from regression with this data set.
Also, we need to take into account that those profiles with a huge number of shares, ie, outliers, are going to be inside the LotLikes category, so, by the moment, we need to care because we are considering also the outliers for the classification set, we will deal in a few lines with the ones on regression, that are easier to deal with.
Now, I am going to try to see and understand, the classification data set where the only difference is that the classification has the predicted variable as a categorical one. Let´s see a bar plot with each of the levels:
ggplot(data=dataTrain_csf) + aes(x = shares) + geom_bar(fill = "darkgreen") + ggtitle("Total shares")p = ggplot(dataTrain_csf, aes(x=shares,fill = channel)) + geom_bar(); ggplotly(p)p = ggplot(dataTrain_csf, aes(x=channel,fill = shares)) + geom_bar(); ggplotly(p)p = ggplot(dataTrain_csf, aes(x=shares,fill = weekday)) + geom_bar(); ggplotly(p)p = ggplot(dataTrain_csf, aes(x=weekday,fill = shares)) + geom_bar(); ggplotly(p)data_csf$weekday = as.numeric(as.factor(data_csf$weekday))
data_csf$channel = as.numeric(as.factor(data_csf$channel))Here we can see the same 2 plots in a different way, I personally like more the first one as it is more visual. Here we can choose the day of the week and the channel of the media and see that each category is more or less spread all about them.
We are going to apply lot of tools in regression for previsualization, later on. ## Linear Discriminant Analysis; LDA
Now, I am creating just the model for the LDA:
model_lda = lda(shares ~ ., data=dataTrain_csf, prior = c(1/3, 1/3, 1/3)); model_lda## Warning in lda.default(x, grouping, ...): variables are collinear
## Call:
## lda(shares ~ ., data = dataTrain_csf, prior = c(1/3, 1/3, 1/3))
##
## Prior probabilities of groups:
## FewLikes MidLikes LotLikes
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## timedelta n_tokens_title n_tokens_content n_unique_tokens
## FewLikes 345.6901 10.53028 517.7591 0.5410529
## MidLikes 350.8482 10.38854 545.8244 0.5297094
## LotLikes 365.4581 10.29670 573.9718 0.5890351
## n_non_stop_words n_non_stop_unique_tokens num_hrefs num_self_hrefs
## FewLikes 0.9742455 0.6861185 9.532696 3.027968
## MidLikes 0.9725727 0.6729847 10.720908 3.338766
## LotLikes 1.0637785 0.7228117 12.335295 3.407275
## num_imgs num_videos average_token_length num_keywords kw_min_min
## FewLikes 3.820121 1.210262 4.579605 7.069316 21.53169
## MidLikes 4.353049 1.183141 4.560364 7.209954 28.48589
## LotLikes 5.431880 1.347356 4.501834 7.395311 28.69973
## kw_max_min kw_avg_min kw_min_max kw_max_max kw_avg_max kw_min_avg
## FewLikes 1048.457 285.3763 11877.88 761038.8 255398.4 955.0905
## MidLikes 1103.878 307.4499 13457.96 748518.4 255362.8 1114.1738
## LotLikes 1300.295 341.7952 14739.67 747817.5 266309.3 1267.1326
## kw_max_avg kw_avg_avg self_reference_min_shares
## FewLikes 5161.108 2864.241 2816.275
## MidLikes 5371.899 3051.414 3747.274
## LotLikes 6344.925 3462.573 5590.497
## self_reference_max_shares self_reference_avg_sharess is_weekend
## FewLikes 7083.399 4461.428 0.05492958
## MidLikes 9883.619 6084.898 0.14887837
## LotLikes 14032.695 8823.294 0.18159012
## LDA_00 LDA_01 LDA_02 LDA_03 LDA_04 global_subjectivity
## FewLikes 0.1631171 0.1670079 0.2767935 0.2043417 0.1887398 0.4338808
## MidLikes 0.1897776 0.1356326 0.2152267 0.2020751 0.2572880 0.4427988
## LotLikes 0.2021912 0.1242539 0.1596759 0.2616896 0.2520929 0.4533785
## global_sentiment_polarity global_rate_positive_words
## FewLikes 0.1105689 0.03822679
## MidLikes 0.1214890 0.03975512
## LotLikes 0.1261304 0.04085099
## global_rate_negative_words rate_positive_words rate_negative_words
## FewLikes 0.01697033 0.6720537 0.3020912
## MidLikes 0.01633222 0.6877758 0.2846217
## LotLikes 0.01650785 0.6861405 0.2770011
## avg_positive_polarity min_positive_polarity max_positive_polarity
## FewLikes 0.3506987 0.09800050 0.7450717
## MidLikes 0.3536707 0.09507566 0.7563305
## LotLikes 0.3588554 0.09440743 0.7700489
## avg_negative_polarity min_negative_polarity max_negative_polarity
## FewLikes -0.2585912 -0.5205257 -0.1074286
## MidLikes -0.2557215 -0.5145062 -0.1069595
## LotLikes -0.2638996 -0.5307727 -0.1080979
## title_subjectivity title_sentiment_polarity abs_title_subjectivity
## FewLikes 0.2729206 0.05483263 0.3402783
## MidLikes 0.2749049 0.07156305 0.3414546
## LotLikes 0.3017967 0.08799310 0.3427411
## abs_title_sentiment_polarity weekdayMonday weekdaySaturday
## FewLikes 0.1460433 0.1792757 0.02323944
## MidLikes 0.1516071 0.1638626 0.06983877
## LotLikes 0.1695116 0.1604593 0.09002316
## weekdaySunday weekdayThursday weekdayTuesday weekdayWednesday
## FewLikes 0.03169014 0.2049296 0.2060362 0.2197183
## MidLikes 0.07903961 0.1809499 0.1850683 0.1735892
## LotLikes 0.09156696 0.1677924 0.1697221 0.1723273
## channelEntertainment channelLifestyle channelOther channelSocial-Media
## FewLikes 0.2386318 0.03812877 0.1162978 0.02022133
## MidLikes 0.1637750 0.05608132 0.1288118 0.06466877
## LotLikes 0.1324778 0.06348900 0.2171941 0.08722501
## channelTech channelWorld
## FewLikes 0.1237425 0.3011066
## MidLikes 0.2109183 0.2075009
## LotLikes 0.2170976 0.1358549
##
## Coefficients of linear discriminants:
## LD1 LD2
## timedelta 1.462389e-05 2.178162e-03
## n_tokens_title 2.234114e-03 1.794558e-02
## n_tokens_content 1.856687e-04 5.127276e-04
## n_unique_tokens -1.705518e-01 9.237163e-01
## n_non_stop_words 7.328564e-01 -1.397540e+00
## n_non_stop_unique_tokens -9.835254e-01 1.258208e+00
## num_hrefs 1.069591e-02 4.955552e-03
## num_self_hrefs -2.390610e-02 -2.903602e-02
## num_imgs 1.749186e-03 9.469290e-03
## num_videos -1.027378e-04 -1.735870e-02
## average_token_length -1.529912e-01 -4.644799e-01
## num_keywords 4.665224e-02 3.078499e-02
## kw_min_min 2.385898e-03 -6.159271e-03
## kw_max_min 4.721541e-05 7.136488e-05
## kw_avg_min -3.139828e-04 -5.722489e-04
## kw_min_max -5.235777e-07 -1.563456e-07
## kw_max_max -3.378508e-07 -6.005659e-07
## kw_avg_max -6.953823e-07 -6.371711e-07
## kw_min_avg -1.095410e-04 -2.205608e-04
## kw_max_avg -1.259629e-04 -6.721628e-05
## kw_avg_avg 9.623514e-04 6.816364e-04
## self_reference_min_shares 4.766393e-08 5.654143e-06
## self_reference_max_shares 7.561956e-07 1.388984e-06
## self_reference_avg_sharess 3.443757e-06 -2.798286e-06
## is_weekend 4.698710e-01 -3.700727e-01
## LDA_00 9.177644e-01 1.109198e+00
## LDA_01 -4.472565e-01 1.511612e-01
## LDA_02 -6.551381e-01 1.195464e-01
## LDA_03 -3.621915e-01 2.039814e-01
## LDA_04 1.847968e-01 -7.854685e-01
## global_subjectivity 9.145843e-01 -3.963933e-01
## global_sentiment_polarity -3.360282e-01 -8.746186e-01
## global_rate_positive_words -1.347302e+00 5.001473e+00
## global_rate_negative_words 3.995810e+00 -7.193399e+00
## rate_positive_words 3.619145e-01 1.687126e+00
## rate_negative_words -3.106400e-01 2.157279e+00
## avg_positive_polarity 1.614009e-01 7.037249e-02
## min_positive_polarity -4.482421e-01 3.840755e-01
## max_positive_polarity -1.960804e-01 1.680489e-01
## avg_negative_polarity -5.083738e-01 -8.376589e-01
## min_negative_polarity 7.656287e-02 6.527369e-02
## max_negative_polarity 3.156968e-01 6.501577e-01
## title_subjectivity 1.749175e-01 3.826049e-01
## title_sentiment_polarity 2.176893e-01 1.477612e-01
## abs_title_subjectivity 3.853358e-01 4.507512e-01
## abs_title_sentiment_polarity -3.731572e-02 -4.871748e-02
## weekdayMonday -1.709968e-01 3.505963e-01
## weekdaySaturday 5.010216e-01 -1.753058e-01
## weekdaySunday 3.682952e-01 -4.894109e-01
## weekdayThursday -3.329585e-01 2.812499e-01
## weekdayTuesday -3.405809e-01 2.318281e-01
## weekdayWednesday -3.804344e-01 6.741112e-01
## channelEntertainment -1.001509e-01 1.068488e+00
## channelLifestyle 2.972539e-01 5.490102e-01
## channelOther 3.714482e-01 1.374427e+00
## channelSocial-Media 1.371061e+00 -3.747689e-02
## channelTech 1.027005e+00 6.447557e-01
## channelWorld 3.179010e-01 1.309428e+00
##
## Proportion of trace:
## LD1 LD2
## 0.9314 0.0686
model_lda = lda(shares ~ ., data=dataTrain_csf); model_lda## Warning in lda.default(x, grouping, ...): variables are collinear
## Call:
## lda(shares ~ ., data = dataTrain_csf)
##
## Prior probabilities of groups:
## FewLikes MidLikes LotLikes
## 0.3134065 0.3598184 0.3267751
##
## Group means:
## timedelta n_tokens_title n_tokens_content n_unique_tokens
## FewLikes 345.6901 10.53028 517.7591 0.5410529
## MidLikes 350.8482 10.38854 545.8244 0.5297094
## LotLikes 365.4581 10.29670 573.9718 0.5890351
## n_non_stop_words n_non_stop_unique_tokens num_hrefs num_self_hrefs
## FewLikes 0.9742455 0.6861185 9.532696 3.027968
## MidLikes 0.9725727 0.6729847 10.720908 3.338766
## LotLikes 1.0637785 0.7228117 12.335295 3.407275
## num_imgs num_videos average_token_length num_keywords kw_min_min
## FewLikes 3.820121 1.210262 4.579605 7.069316 21.53169
## MidLikes 4.353049 1.183141 4.560364 7.209954 28.48589
## LotLikes 5.431880 1.347356 4.501834 7.395311 28.69973
## kw_max_min kw_avg_min kw_min_max kw_max_max kw_avg_max kw_min_avg
## FewLikes 1048.457 285.3763 11877.88 761038.8 255398.4 955.0905
## MidLikes 1103.878 307.4499 13457.96 748518.4 255362.8 1114.1738
## LotLikes 1300.295 341.7952 14739.67 747817.5 266309.3 1267.1326
## kw_max_avg kw_avg_avg self_reference_min_shares
## FewLikes 5161.108 2864.241 2816.275
## MidLikes 5371.899 3051.414 3747.274
## LotLikes 6344.925 3462.573 5590.497
## self_reference_max_shares self_reference_avg_sharess is_weekend
## FewLikes 7083.399 4461.428 0.05492958
## MidLikes 9883.619 6084.898 0.14887837
## LotLikes 14032.695 8823.294 0.18159012
## LDA_00 LDA_01 LDA_02 LDA_03 LDA_04 global_subjectivity
## FewLikes 0.1631171 0.1670079 0.2767935 0.2043417 0.1887398 0.4338808
## MidLikes 0.1897776 0.1356326 0.2152267 0.2020751 0.2572880 0.4427988
## LotLikes 0.2021912 0.1242539 0.1596759 0.2616896 0.2520929 0.4533785
## global_sentiment_polarity global_rate_positive_words
## FewLikes 0.1105689 0.03822679
## MidLikes 0.1214890 0.03975512
## LotLikes 0.1261304 0.04085099
## global_rate_negative_words rate_positive_words rate_negative_words
## FewLikes 0.01697033 0.6720537 0.3020912
## MidLikes 0.01633222 0.6877758 0.2846217
## LotLikes 0.01650785 0.6861405 0.2770011
## avg_positive_polarity min_positive_polarity max_positive_polarity
## FewLikes 0.3506987 0.09800050 0.7450717
## MidLikes 0.3536707 0.09507566 0.7563305
## LotLikes 0.3588554 0.09440743 0.7700489
## avg_negative_polarity min_negative_polarity max_negative_polarity
## FewLikes -0.2585912 -0.5205257 -0.1074286
## MidLikes -0.2557215 -0.5145062 -0.1069595
## LotLikes -0.2638996 -0.5307727 -0.1080979
## title_subjectivity title_sentiment_polarity abs_title_subjectivity
## FewLikes 0.2729206 0.05483263 0.3402783
## MidLikes 0.2749049 0.07156305 0.3414546
## LotLikes 0.3017967 0.08799310 0.3427411
## abs_title_sentiment_polarity weekdayMonday weekdaySaturday
## FewLikes 0.1460433 0.1792757 0.02323944
## MidLikes 0.1516071 0.1638626 0.06983877
## LotLikes 0.1695116 0.1604593 0.09002316
## weekdaySunday weekdayThursday weekdayTuesday weekdayWednesday
## FewLikes 0.03169014 0.2049296 0.2060362 0.2197183
## MidLikes 0.07903961 0.1809499 0.1850683 0.1735892
## LotLikes 0.09156696 0.1677924 0.1697221 0.1723273
## channelEntertainment channelLifestyle channelOther channelSocial-Media
## FewLikes 0.2386318 0.03812877 0.1162978 0.02022133
## MidLikes 0.1637750 0.05608132 0.1288118 0.06466877
## LotLikes 0.1324778 0.06348900 0.2171941 0.08722501
## channelTech channelWorld
## FewLikes 0.1237425 0.3011066
## MidLikes 0.2109183 0.2075009
## LotLikes 0.2170976 0.1358549
##
## Coefficients of linear discriminants:
## LD1 LD2
## timedelta 1.975240e-05 2.178121e-03
## n_tokens_title 2.276361e-03 1.794027e-02
## n_tokens_content 1.868754e-04 5.122890e-04
## n_unique_tokens -1.683764e-01 9.241153e-01
## n_non_stop_words 7.295638e-01 -1.399262e+00
## n_non_stop_unique_tokens -9.805602e-01 1.260520e+00
## num_hrefs 1.070754e-02 4.930354e-03
## num_self_hrefs -2.397440e-02 -2.897965e-02
## num_imgs 1.771477e-03 9.465145e-03
## num_videos -1.436091e-04 -1.735841e-02
## average_token_length -1.540844e-01 -4.641184e-01
## num_keywords 4.672459e-02 3.067506e-02
## kw_min_min 2.371389e-03 -6.164872e-03
## kw_max_min 4.738331e-05 7.125351e-05
## kw_avg_min -3.153293e-04 -5.715080e-04
## kw_min_max -5.239444e-07 -1.551124e-07
## kw_max_max -3.392639e-07 -5.997688e-07
## kw_avg_max -6.968806e-07 -6.355320e-07
## kw_min_avg -1.100600e-04 -2.203022e-04
## kw_max_avg -1.261208e-04 -6.691951e-05
## kw_avg_avg 9.639537e-04 6.793686e-04
## self_reference_min_shares 6.097665e-08 5.654015e-06
## self_reference_max_shares 7.594639e-07 1.387199e-06
## self_reference_avg_sharess 3.437159e-06 -2.806387e-06
## is_weekend 4.689983e-01 -3.711780e-01
## LDA_00 9.203735e-01 1.107034e+00
## LDA_01 -4.468994e-01 1.522139e-01
## LDA_02 -6.548548e-01 1.210886e-01
## LDA_03 -3.617102e-01 2.048337e-01
## LDA_04 1.829469e-01 -7.859014e-01
## global_subjectivity 9.136484e-01 -3.985456e-01
## global_sentiment_polarity -3.380866e-01 -8.738250e-01
## global_rate_positive_words -1.335522e+00 5.004632e+00
## global_rate_negative_words 3.978861e+00 -7.202787e+00
## rate_positive_words 3.658858e-01 1.686269e+00
## rate_negative_words -3.055598e-01 2.158004e+00
## avg_positive_polarity 1.615662e-01 6.999228e-02
## min_positive_polarity -4.473366e-01 3.851298e-01
## max_positive_polarity -1.956842e-01 1.685101e-01
## avg_negative_polarity -5.103446e-01 -8.364596e-01
## min_negative_polarity 7.671634e-02 6.509324e-02
## max_negative_polarity 3.172267e-01 6.494126e-01
## title_subjectivity 1.758179e-01 3.821920e-01
## title_sentiment_polarity 2.180366e-01 1.472483e-01
## abs_title_subjectivity 3.863961e-01 4.498426e-01
## abs_title_sentiment_polarity -3.743032e-02 -4.862949e-02
## weekdayMonday -1.701708e-01 3.509980e-01
## weekdaySaturday 5.006074e-01 -1.764850e-01
## weekdaySunday 3.671418e-01 -4.902767e-01
## weekdayThursday -3.322954e-01 2.820331e-01
## weekdayTuesday -3.400341e-01 2.326293e-01
## weekdayWednesday -3.788462e-01 6.750051e-01
## channelEntertainment -9.763482e-02 1.068721e+00
## channelLifestyle 2.985458e-01 5.483088e-01
## channelOther 3.746833e-01 1.373549e+00
## channelSocial-Media 1.370969e+00 -4.070499e-02
## channelTech 1.028520e+00 6.423358e-01
## channelWorld 3.209832e-01 1.308676e+00
##
## Proportion of trace:
## LD1 LD2
## 0.9262 0.0738
Variable prior is not mandatory but it is important because introduces 3 probabilities, with the proportion of shares pi1 = Few, pi2 = Mid, pi3 = Lot; Prior is the proportion of shares in each category, we are just guessing by checking on the data set. We are finally removing it, as for this data set, the proportion is more or less the same. These are the hyper parameters because they belong to the future that you don´t know.
Now we are going to predict and compute the future probabilities:
probability = predict(model_lda, newdata=dataTest_csf)$posterior
head(probability)## FewLikes MidLikes LotLikes
## 3 0.5489891 0.3494641 0.10154681
## 7 0.5421282 0.3502987 0.10757309
## 13 0.5994757 0.2579056 0.14261865
## 14 0.7389681 0.1914193 0.06961263
## 18 0.6042526 0.2881476 0.10759982
## 20 0.4388105 0.3963282 0.16486122
As we are using machine learning to predict and this is a bayesian tool; the output is a probability, choosing the maximum probability is mathematically perfect, but if we were a company this will be very expensive. As we are not, let´s do it:
prediction = max.col(probability); head(prediction) ## [1] 1 1 1 1 1 1
prediction = predict(model_lda, newdata=dataTest_csf)$class; head(prediction)## [1] FewLikes FewLikes FewLikes FewLikes FewLikes FewLikes
## Levels: FewLikes MidLikes LotLikes
We have chosen the column with highest probability, and as we obtain the split in numbers and we want categories, we use the testing set.
The confusion matrix: predictions in rows, true values in columns (but we can change the order) For us to know the performance of the LDA model, we need to test it and check the so called confusion matrix, that is the matrix that contains the predictions in the Testing set, the true values.
confusionMatrix(prediction, dataTest_csf$shares)## Confusion Matrix and Statistics
##
## Reference
## Prediction FewLikes MidLikes LotLikes
## FewLikes 1424 903 554
## MidLikes 678 1078 799
## LotLikes 382 872 1238
##
## Overall Statistics
##
## Accuracy : 0.4717
## 95% CI : (0.4607, 0.4828)
## No Information Rate : 0.3599
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2085
##
## Mcnemar's Test P-Value : 2.05e-14
##
## Statistics by Class:
##
## Class: FewLikes Class: MidLikes Class: LotLikes
## Sensitivity 0.5733 0.3778 0.4778
## Specificity 0.7324 0.7090 0.7650
## Pos Pred Value 0.4943 0.4219 0.4968
## Neg Pred Value 0.7900 0.6696 0.7511
## Prevalence 0.3133 0.3599 0.3268
## Detection Rate 0.1796 0.1360 0.1562
## Detection Prevalence 0.3634 0.3223 0.3143
## Balanced Accuracy 0.6528 0.5434 0.6214
confusionMatrix(prediction, dataTest_csf$shares)$table## Reference
## Prediction FewLikes MidLikes LotLikes
## FewLikes 1424 903 554
## MidLikes 678 1078 799
## LotLikes 382 872 1238
confusionMatrix(prediction, dataTest_csf$shares)$overall[1]## Accuracy
## 0.4717457
If we were doing regression, we would compute the R square (will do it later), but as we are not, we are going to obtain the confusion error. We can see that the kappa is around 0.19 and the accuracy is around 48%, which is not bad at all because this model was so easy to compute and cheap but it is definitely not accurate.
It is important to take into account that for been an LDA model, the accuracy is quite high due to the fact that the data split is highly balanced. Nevertheless, I am going to try a different approach.
Let´s apply same tools as in LDA but with a different model approach for the moment: First, I need to re-split
summary(dataTrain_csf$shares)## FewLikes MidLikes LotLikes
## 9940 11412 10364
splitting_qda = createDataPartition(data_csf$shares, p = 0.85, list = FALSE)
dataTrain_qda = data_csf[splitting_qda,]
dataTest_qda = data_csf[-splitting_qda,]
summary(dataTrain_qda$shares)## FewLikes MidLikes LotLikes
## 10561 12126 11012
dataTrain_qda = dataTrain_qda[,-c(6,7,8,9,10,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27)]
#I am removing the less possible variables just to avoid collinearity in the model
model_qda <- qda(shares ~ ., data=dataTrain_qda); model_qda## Call:
## qda(shares ~ ., data = dataTrain_qda)
##
## Prior probabilities of groups:
## FewLikes MidLikes LotLikes
## 0.3133921 0.3598326 0.3267753
##
## Group means:
## timedelta n_tokens_title n_tokens_content n_unique_tokens
## FewLikes 347.5032 10.52457 519.0209 0.5410393
## MidLikes 350.2134 10.39048 548.7354 0.5288285
## LotLikes 366.6715 10.30104 574.5031 0.5222045
## n_non_stop_words average_token_length num_keywords LDA_02 LDA_03
## FewLikes 0.9744342 4.582809 7.063062 0.2766769 0.2071301
## MidLikes 0.9720435 4.557338 7.208972 0.2148112 0.2044536
## LotLikes 0.9646749 4.507581 7.396749 0.1585813 0.2637528
## LDA_04 global_subjectivity global_sentiment_polarity
## FewLikes 0.1891222 0.4333829 0.1099478
## MidLikes 0.2552931 0.4433531 0.1218988
## LotLikes 0.2513853 0.4545307 0.1261176
## global_rate_positive_words global_rate_negative_words
## FewLikes 0.03818540 0.01710310
## MidLikes 0.03989204 0.01637471
## LotLikes 0.04079279 0.01653743
## rate_positive_words rate_negative_words avg_positive_polarity
## FewLikes 0.6705364 0.3038031 0.3504094
## MidLikes 0.6876688 0.2842098 0.3534693
## LotLikes 0.6866771 0.2779070 0.3588756
## min_positive_polarity max_positive_polarity avg_negative_polarity
## FewLikes 0.09815644 0.7440926 -0.2587443
## MidLikes 0.09463095 0.7575850 -0.2560930
## LotLikes 0.09392458 0.7703931 -0.2648767
## min_negative_polarity max_negative_polarity title_subjectivity
## FewLikes -0.5210734 -0.1069238 0.2677329
## MidLikes -0.5161442 -0.1064693 0.2746782
## LotLikes -0.5321845 -0.1089502 0.3019028
## title_sentiment_polarity abs_title_subjectivity
## FewLikes 0.05377237 0.3404780
## MidLikes 0.07024472 0.3421620
## LotLikes 0.08814128 0.3427182
## abs_title_sentiment_polarity weekday channel
## FewLikes 0.1439285 4.475144 4.164947
## MidLikes 0.1511973 4.234290 4.208808
## LotLikes 0.1711601 4.177534 4.145932
We can find many issues with this model. The minimum size on each category so that the model works is 10000 rows, that´s the reason why I have used 90% for the training test just for this approach, depending on how the split is done, this piece of code could result in a error, rank deficiency, cause by what I have just explained. Also, collinearity breaks this model, so it is important to have just a few important columns. We are going to choose the main principal ones.
This is one of the reasons why reducing noise is so important. I am keeping many of the columns just because in some models overfitting is something good to predict, but not to understand what is going on.
Performance:
prediction = predict(model_qda, newdata=dataTest_qda)$class
confusionMatrix(prediction, dataTest_qda$shares)## Confusion Matrix and Statistics
##
## Reference
## Prediction FewLikes MidLikes LotLikes
## FewLikes 1084 942 616
## MidLikes 310 559 519
## LotLikes 469 638 808
##
## Overall Statistics
##
## Accuracy : 0.4123
## 95% CI : (0.3997, 0.4249)
## No Information Rate : 0.3598
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.1247
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: FewLikes Class: MidLikes Class: LotLikes
## Sensitivity 0.5819 0.26134 0.4159
## Specificity 0.6183 0.78219 0.7234
## Pos Pred Value 0.4103 0.40274 0.4219
## Neg Pred Value 0.7642 0.65328 0.7184
## Prevalence 0.3134 0.35980 0.3268
## Detection Rate 0.1823 0.09403 0.1359
## Detection Prevalence 0.4444 0.23347 0.3221
## Balanced Accuracy 0.6001 0.52176 0.5696
confusionMatrix(prediction, dataTest_qda$shares)$table## Reference
## Prediction FewLikes MidLikes LotLikes
## FewLikes 1084 942 616
## MidLikes 310 559 519
## LotLikes 469 638 808
confusionMatrix(prediction, dataTest_qda$shares)$overall[1]## Accuracy
## 0.4122792
Accuracy seems to be so poor now, kappa has been reduced to 0.1 and accuracy does not achieve the 39%. No improvement with respect to LDA.
What is the accuracy on a benchmark model that predicts the most frequent outcome (No Delay) for all observations? Benchmark models basically predict on a basis of what is the most frequent output, it is a cheap choice, so let´s try it:
table(dataTrain_csf$shares)##
## FewLikes MidLikes LotLikes
## 9940 11412 10364
obs = max(table(dataTest_csf$shares))
accuracy = obs/nrow(dataTest_csf); accuracy## [1] 0.3598638
Accuracy rounds 36%, that is good if we compare it with the 39% of the QDA and also compare the time consumed. It is not so usefull to predict as 2 of each 3 times, the prediction will be wrong.
We are obtaining pretty low accuracy results, I am going to try other different models and aproaches that will provide us better results, let´s use them:
As we know, in naive bayes, there is Standard and Gaussian classification, where the first assumes that the predictor variables are independent and the second, the Gaussian, does not assume and it is given the target class. I am going first with the Gaussian approach:
model_nb0 = naiveBayes(as.matrix(dataTrain_csf), y.train, laplace = 0.6) # laplace controls smoothing of probabilities
prediction = predict(model_nb0, as.matrix(dataTest_csf)) We are assuming that the columns are independent and there is going to be a very low varianciace but bias is huge.
Let´s see the results in the confusion Matrix:
confusionMatrix(prediction, y.test)## Confusion Matrix and Statistics
##
## Reference
## Prediction FewLikes MidLikes LotLikes
## FewLikes 2478 14 19
## MidLikes 1 2839 4
## LotLikes 5 0 2568
##
## Overall Statistics
##
## Accuracy : 0.9946
## 95% CI : (0.9927, 0.9961)
## No Information Rate : 0.3599
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9919
##
## Mcnemar's Test P-Value : 3.28e-05
##
## Statistics by Class:
##
## Class: FewLikes Class: MidLikes Class: LotLikes
## Sensitivity 0.9976 0.9951 0.9911
## Specificity 0.9939 0.9990 0.9991
## Pos Pred Value 0.9869 0.9982 0.9981
## Neg Pred Value 0.9989 0.9972 0.9957
## Prevalence 0.3133 0.3599 0.3268
## Detection Rate 0.3126 0.3581 0.3239
## Detection Prevalence 0.3167 0.3587 0.3245
## Balanced Accuracy 0.9958 0.9971 0.9951
confusionMatrix(prediction, y.test)$overall[1]## Accuracy
## 0.9945762
Accuracy seems to be 99.4% what is an insane result. But kappa is also too high, 98%. The fact that the accuracy is that high makes me suspicious and not to trust that much that value, but it seems to be a really good result, just for comparison, let´s try the probabilities as an output:
probability = predict(model_nb0, as.matrix(dataTest_csf),type="raw")
head(probability)## FewLikes MidLikes LotLikes
## [1,] 1.383183e-04 9.998558e-01 5.919158e-06
## [2,] 9.999964e-01 1.138425e-06 2.440958e-06
## [3,] 9.999346e-01 9.226965e-07 6.445266e-05
## [4,] 3.349156e-05 1.883209e-06 9.999646e-01
## [5,] 2.160561e-03 5.608863e-05 9.977834e-01
## [6,] 3.355196e-04 1.143374e-05 9.996530e-01
hist(probability) This values are quite extreme (most close to 0 or 1) so need to take care with them also.
We need to check kappa, that is the accuracy but discounting the accuracy of a naive classifier, your tool must have an accuracy greater than the Kappa number, that is the expected accuracy since a naive approach, a really basic one. We can see that this tool as it is providing a 99% is really good.
As kappa is quite high, we can expect the accuracy to be also really high.
I have also tried to compute Bernoulli naive-bayes approximation and multinomial but I have not been capable of computing them due to many different errors. Let´s use more advanced tools:
Thanks to caret, a package in R, we can try more than 200 different models in an easy way, using cross validation (cv) and multiple core processing. We will use now caret a lot, but let´s start by using it to predict with naive Bayes:
Firstly, here we define the parameters that are going to go in the control variable:
ctrl = trainControl(method = "repeatedcv", # Here we can use bootstrap, LOOCV and many others.
repeats = 2, # repeat cross-validation 5 times just to be sure.
number = 3, # kfolds
allowParallel = TRUE,
verboseIter = T)We must take care as caret does not have implemented the multinomial naive Bayes. So, instead of 3 categories, we are going to do just binomial, this is why we have transformed our data set into binary:
trctrl = trainControl(method = "none")
tic()
cl = startParallel(); cl## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
nb_grid = expand.grid(usekernel = c(TRUE, FALSE),
laplace = seq(0, 1, 0.1),
adjust = seq(0.5, 1.5, 0.1))
nb_mod = train(x = dataTrain_csf, #train means estimate, fit or optimize, is all the same. X is where is training set
y = y.train,
method = "naive_bayes", # This is the only thing that changes. You just type the name of the machine learning tool .
trControl = ctrl, # control can be: do or do not do cross-validation (cv) for instance.
tuneGrid = nb_grid,
num.threads = (detectCores() - 1))## Aggregating results
## Selecting tuning parameters
## Fitting laplace = 0.1, usekernel = TRUE, adjust = 1.5 on full training set
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
## Warning: In density.default(x, na.rm = TRUE, ...) :
## extra argument 'num.threads' will be disregarded
nb_pred = predict(nb_mod,
newdata = dataTest_csf) # This piece of code is always the same for all the models.
confusionMatrix(nb_pred,y.test)## Confusion Matrix and Statistics
##
## Reference
## Prediction FewLikes MidLikes LotLikes
## FewLikes 2469 1 7
## MidLikes 2 2816 2
## LotLikes 13 36 2582
##
## Overall Statistics
##
## Accuracy : 0.9923
## 95% CI : (0.9901, 0.9941)
## No Information Rate : 0.3599
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9884
##
## Mcnemar's Test P-Value : 3.999e-07
##
## Statistics by Class:
##
## Class: FewLikes Class: MidLikes Class: LotLikes
## Sensitivity 0.9940 0.9870 0.9965
## Specificity 0.9985 0.9992 0.9908
## Pos Pred Value 0.9968 0.9986 0.9814
## Neg Pred Value 0.9972 0.9928 0.9983
## Prevalence 0.3133 0.3599 0.3268
## Detection Rate 0.3114 0.3552 0.3257
## Detection Prevalence 0.3124 0.3557 0.3319
## Balanced Accuracy 0.9962 0.9931 0.9937
confusionMatrix(nb_pred,y.test)$overall[1]## Accuracy
## 0.9923058
# plot(nb_mod)
toc()## 90.11 sec elapsed
endParallel(cl)## [1] "The model has finished."
plot(nb_mod)plot(confusionMatrix(nb_pred,y.test)[["table"]])Aggregating results
Selecting tuning parameters
Fitting laplace = 0.1, usekernel = TRUE, adjust = 1.5 on full training set
So, as a result, we can see that the accuracy as before is extremely high, because it is around 99.2%. We have use by first time in this project, caret package, using a training control, doing cross validation and different hyper parameters, but we did not obtain a so high accuracy with that, in comparison with the previous result in naive bayes, but we need to take into account that the accuracy is more than 99%, so improve would be almost impossible.
In order to use logistic regression with the glm package, our categorical target must be binomial, something like yes or no, so we need to transform the Low, Mid and High number of shares into Low and High number of shares. The problem of this in relation with our data is that we will get only information about if the number of shares is going to be above or below our threshold (of 1400) so it is not going to be so usefull, despite this, we are using it:
data_csf_bin = data
data_csf_bin$shares = factor(ifelse(data_csf_bin$shares <= 1400, "FewLikes", "LotLikes"))
splitting_csf_bin = createDataPartition(data_csf_bin$shares, p = 0.8, list = FALSE)
dataTrain_csf_bin = data_csf_bin[splitting_csf_bin,]
dataTest_csf_bin = data_csf_bin[-splitting_csf_bin,]Now I am creating the Logistic Regression model:
model_lgr = glm(shares ~., family = binomial(link="logit"),data=dataTrain_csf_bin); summary(model_lgr) ##
## Call:
## glm(formula = shares ~ ., family = binomial(link = "logit"),
## data = dataTrain_csf_bin)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7319 -1.0349 -0.6274 1.0700 2.1890
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.592e+02 1.334e+03 0.344 0.730573
## timedelta 1.129e-04 8.232e-05 1.372 0.170066
## n_tokens_title -2.074e-03 6.048e-03 -0.343 0.731679
## n_tokens_content 1.331e-04 4.813e-05 2.766 0.005681 **
## n_unique_tokens -2.519e-01 4.018e-01 -0.627 0.530626
## n_non_stop_words 1.421e-02 1.283e+00 0.011 0.991159
## n_non_stop_unique_tokens -4.438e-01 3.403e-01 -1.304 0.192211
## num_hrefs 7.731e-03 1.466e-03 5.275 1.33e-07 ***
## num_self_hrefs -2.478e-02 3.815e-03 -6.494 8.35e-11 ***
## num_imgs 2.401e-03 1.911e-03 1.256 0.209010
## num_videos 2.840e-03 3.336e-03 0.851 0.394548
## average_token_length -1.266e-01 5.040e-02 -2.511 0.012027 *
## num_keywords 4.206e-02 7.772e-03 5.412 6.23e-08 ***
## kw_min_min 1.664e-03 3.451e-04 4.823 1.42e-06 ***
## kw_max_min 3.487e-05 1.589e-05 2.194 0.028207 *
## kw_avg_min -2.703e-04 1.032e-04 -2.619 0.008812 **
## kw_min_max -4.408e-07 2.409e-07 -1.830 0.067246 .
## kw_max_max -3.938e-07 1.241e-07 -3.172 0.001514 **
## kw_avg_max -5.160e-07 1.782e-07 -2.896 0.003779 **
## kw_min_avg -9.007e-05 1.613e-05 -5.585 2.34e-08 ***
## kw_max_avg -8.599e-05 5.549e-06 -15.496 < 2e-16 ***
## kw_avg_avg 7.127e-04 3.223e-05 22.110 < 2e-16 ***
## self_reference_min_shares -8.963e-07 1.960e-06 -0.457 0.647513
## self_reference_max_shares 6.542e-08 9.561e-07 0.068 0.945448
## self_reference_avg_sharess 4.499e-06 2.552e-06 1.763 0.077942 .
## is_weekend 5.150e-01 5.702e-02 9.032 < 2e-16 ***
## LDA_00 -4.603e+02 1.334e+03 -0.345 0.729960
## LDA_01 -4.614e+02 1.334e+03 -0.346 0.729338
## LDA_02 -4.616e+02 1.334e+03 -0.346 0.729226
## LDA_03 -4.614e+02 1.334e+03 -0.346 0.729315
## LDA_04 -4.609e+02 1.334e+03 -0.346 0.729616
## global_subjectivity 9.171e-01 1.771e-01 5.179 2.23e-07 ***
## global_sentiment_polarity -1.249e-02 3.492e-01 -0.036 0.971461
## global_rate_positive_words -2.295e+00 1.483e+00 -1.548 0.121716
## global_rate_negative_words 1.536e+00 2.881e+00 0.533 0.593948
## rate_positive_words 7.927e-01 1.255e+00 0.631 0.527783
## rate_negative_words 5.077e-01 1.265e+00 0.401 0.688222
## avg_positive_polarity -3.372e-01 2.858e-01 -1.180 0.238035
## min_positive_polarity -3.690e-01 2.377e-01 -1.553 0.120501
## max_positive_polarity -1.906e-02 8.959e-02 -0.213 0.831499
## avg_negative_polarity -5.242e-02 2.635e-01 -0.199 0.842287
## min_negative_polarity 3.333e-02 9.633e-02 0.346 0.729390
## max_negative_polarity 4.258e-02 2.169e-01 0.196 0.844354
## title_subjectivity 1.871e-01 5.712e-02 3.275 0.001057 **
## title_sentiment_polarity 2.244e-01 5.230e-02 4.291 1.78e-05 ***
## abs_title_subjectivity 3.463e-01 7.607e-02 4.553 5.29e-06 ***
## abs_title_sentiment_polarity -4.949e-02 8.256e-02 -0.599 0.548859
## weekdayMonday -1.296e-01 4.284e-02 -3.025 0.002486 **
## weekdaySaturday 2.611e-01 7.037e-02 3.710 0.000207 ***
## weekdaySunday NA NA NA NA
## weekdayThursday -2.050e-01 4.204e-02 -4.876 1.08e-06 ***
## weekdayTuesday -2.708e-01 4.194e-02 -6.457 1.07e-10 ***
## weekdayWednesday -2.847e-01 4.197e-02 -6.784 1.17e-11 ***
## channelEntertainment -3.785e-02 7.634e-02 -0.496 0.620077
## channelLifestyle 1.209e-01 7.763e-02 1.558 0.119305
## channelOther 2.582e-01 8.013e-02 3.222 0.001272 **
## channelSocial-Media 1.065e+00 6.964e-02 15.293 < 2e-16 ***
## channelTech 7.422e-01 6.941e-02 10.692 < 2e-16 ***
## channelWorld 2.958e-01 7.477e-02 3.955 7.64e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 43962 on 31715 degrees of freedom
## Residual deviance: 39811 on 31658 degrees of freedom
## AIC: 39927
##
## Number of Fisher Scoring iterations: 8
In logistic regression, as before, we can predict using probabilities:
probability = predict(model_lgr, newdata = dataTest_csf_bin, type='response')## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
head(probability)## 2 5 9 11 13 15
## 0.2196621 0.3118688 0.3353655 0.1984133 0.2535278 0.1826584
prop = seq(0.1, 1, 0.1)
accuracy = expand.grid(prop, 0)
for(i in 1:length(prop)){
prediction1 = as.factor(ifelse(probability > prop[i], "FewLikes", "LotLikes"))
accuracy[i, 2] = confusionMatrix(prediction1, dataTest_csf_bin$shares)$overall[1]
}## Warning in confusionMatrix.default(prediction1, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
accuracy_f = accuracy[which.max(accuracy[,2]),]; accuracy_f## Var1 Var2
## 1 0.1 0.5064329
> accuracy_f = accuracy[which.max(accuracy[,2]),]; accuracy_f
Prob Acc
1 0.1 0.506559
So, up to now, we have been capable of getting a 35% accuracy while saying if a post will have a few likes or a lot of likes, something that should be easy, but the results seem to be not so good at all since kappa also rounds 30% too.
But, when we tune the probability as a hyperparameter we can see that for probability greater than 0.1 we are getting an accuracy of 50.6%, quite better that for the 50% in p. This is because we are saying that being in the category few likes is going be done if p is greater than 0.1 what classifies most of them and improves accuracy, that´s why if we choose p to be zero, it increases more, but this number is not useful/realistic for the number of shares that a post will receive.
When the dimension is high, it is better to use a penalized version. The most known is glmnet or elasticnet.
prop = seq(0.1, 1, 0.1)
alpha = seq(0, 0.1, 0.01)
lambda = seq(0, 0.01, 0.001)
accuracy = expand.grid(prop, alpha, lambda, 0)
for(i in 1:nrow(accuracy)){
model_logit = glmnet((dataTrain_csf_bin[,-40]),dataTrain_csf_bin$shares,
family=c("binomial"), alpha=accuracy[i,2], lambda=accuracy[i,3])
probability2 = predict(model_logit, (data.matrix(dataTest_csf_bin[,-40])), type='response')
prediction2 = as.factor(ifelse(probability2 > accuracy[i,1], "FewLikes", "LotLikes"))
accuracy[i, 4] = confusionMatrix(prediction2, dataTest_csf_bin$shares)$overall[1]
}## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
accuracy_f = accuracy[which.max(accuracy[,4]),]; accuracy_f## Var1 Var2 Var3 Var4
## 10 1 0 0 0.493441
> Var1 Var2 Var3 Var4
> 621 0.1 0.07 0.005 0.5066852
We are obtaining that the best parameters are probability = 0.1, alpha = 0.07, lambda = 0.005 and and accuracy of 50%. We get this values as before, when we set the probability as the lowest one. In comparion with the naive bayes approach, this method is definitely not valid for this data set. If we consider that we get 50% because of choosing the lowest probability, this afirmation will be even stronger.
model = lda(shares ~ ., data=dataTrain_csf_bin, prior = c(.1, .9))## Warning in lda.default(x, grouping, ...): variables are collinear
AS I want that the ones with higher shares represent the highest part of the sample, I set the category HighShares to 90%, so the threshold will be high
I am getting the probabilities and computing the ROC Curve:
probability = predict(model, dataTest_csf_bin)$posterior
roc.lda <- roc(dataTest_csf_bin$shares,probability[,2])## Setting levels: control = FewLikes, case = LotLikes
## Setting direction: controls < cases
auc(roc.lda) ## Area under the curve: 0.7
plot.roc(dataTest_csf_bin$shares, probability[,2],col="darkblue", print.auc = TRUE, auc.polygon=TRUE, grid=c(0.1, 0.2),
grid.col=c("green", "red"), max.auc.polygon=TRUE,
auc.polygon.col="lightblue", print.thres=TRUE)## Setting levels: control = FewLikes, case = LotLikes
## Setting direction: controls < cases
We know that the closer the curve to the top left corner, the better. The optimal threshold is also provided, 2.6% for this case. The area under the curve is 0.7022, so 70% and the values for the specificity are 0.652 and 0.63 for the sensitivity.
Remember that the sensitivity is the positive rate and 1 minus the false positive rate is the specificity. The threshold is 89%, not so good.
We are going to analyze how good it would be for us if we were a company that wants to promote a post and get as much shares as possible; ie, optimize their own publicity and know when will be better to publish our advertisements, this is why we need to add a cost.
Assume the following (company’s data):
Cost of true negatives is 0: the desired number of likes is achieved.
Cost of false negatives is 1000: This means that we think that our post is going to be seeing, but it is not and we are going to lose much money. For each bad advertisement we pay 1000 dolars.
Cost of false positives is 200: If our post is obtaining the desired shares, we increase a bit the money designated to advertisement so that we continue.
Cost of true positives is 140: (1 - gamma) * 1000 + gamma * 200 = 150 + 170 = 320 where gamma = probability an advert is well positioned, 0.85 in our case
Cost matrix:
| Prediction/Reference | no | yes |
|---|---|---|
| predicted no | 0 | 1000 |
| predicted yes | 200 | 320 |
cost.unit <- c(0, 200, 1000, 320)Unit cost for Naive classifier (no analytics knowledge)
cost = 0*.99 + 200*0 + 1000*.01 + + 320*0 = 1 eur/share on average
For this dataset, as the naive bayes works really good with 99% accuracy, the cost will be so small.
Savings = (post increase incentive - cost)
in the case of the naive classifier, the savings = 199 eur/post, i.e. a big gain. As the rest of the models have demostrated a smaller accuracy, we are not going to consider them.
So, if we make 10000 posts we could be gaining around 2.000.000 euros in investing advertising, which will conclude with lots of shares and promotion for any post that we promote / advertise.
Let´s now continue using again caret to try classical machine learning approaches applied to classification:
library(caret)
tic()
cl = startParallel(); cl## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
model = shares ~.
control = rpart.control(minsplit = 40, maxdepth = 12, cp=0.001)
dtFit <- rpart(model, data=dataTrain_csf_bin, method = "class", control = control)
caret.fit <- train(model,
data = dataTrain_csf_bin,
method = "rpart",
control = control,
trControl = trainControl(method = "cv", number = 10),
tuneLength=10)
caret.fit## CART
##
## 31716 samples
## 48 predictor
## 2 classes: 'FewLikes', 'LotLikes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 28545, 28545, 28544, 28545, 28544, 28544, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.002257721 0.6404343 0.2805730
## 0.002428115 0.6401821 0.2801055
## 0.003546326 0.6353580 0.2713267
## 0.003642173 0.6333715 0.2672693
## 0.004664537 0.6246380 0.2500406
## 0.005015974 0.6229038 0.2467485
## 0.008051118 0.6171652 0.2350191
## 0.013546326 0.6126567 0.2247119
## 0.034760383 0.5943370 0.1885777
## 0.177827476 0.5635016 0.1233526
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.002257721.
endParallel(cl);## [1] "The model has finished."
toc()## 20.78 sec elapsed
Now we are predicting in the binary categorical set, and the accuracy is not bad at all: We are getting that the optimal value for the complexity parameter is cp = 0.002811502; The accuracy is almost 63% and kappa is just 27%. If we improve the hyper parameters we will probably get better results.
Visualization of the tree:
library(rpart.plot)
rpart.plot(caret.fit$finalModel)Prediction
dtProb <- predict(caret.fit, dataTest_csf_bin, type = "prob")
threshold = 0.4
dtPred = rep("FewLikes", nrow(dataTest_csf_bin))
dtPred[which(dtProb[,2] > threshold)] = "LotLikes"
summary(dtPred)## Length Class Mode
## 7928 character character
CM = confusionMatrix(table(dtPred, dataTest_csf_bin$shares))$table
cost = sum(as.vector(CM)*cost.unit)/sum((CM)); cost## [1] 296.7255
Here thanks to the previous analysis we are computing the cost, positive by the way, that will take each of the posts that we as a company intend to promote, the closer the threshold is to 0.5, as we increase the accuracy, the cost is reduced, if we increase much the probability, the cost will also be increased.
Now, we are gonna compute the cost but using random forests:
This function that we have used in class, alows us to compute the cost fastest, avoiding time consume:
EconomicCost <- function(data, lev = NULL, model = NULL)
{
y.pred = data$pred
y.true = data$obs
CM = confusionMatrix(y.pred, y.true)$table
out = sum(as.vector(CM)*(c(0, 200, 1000, 320)))/sum(CM)
names(out) <- c("EconomicCost")
out
}Now include this function in the Caret control:
tic()
cl = startParallel(); cl## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
ctrl <- trainControl(method = "cv", number = 5,
classProbs = TRUE,
summaryFunction = EconomicCost,
verboseIter=T,
allowParallel = TRUE)
rf.train <- train(shares ~.,
method = "rf",
data = dataTrain_csf_bin,
preProcess = c("center", "scale"),
ntree = 200,
cutoff=c(0.7,0.3),
tuneGrid = expand.grid(mtry=c(8,10,15)),
metric = "EconomicCost",
maximize = F,
trControl = ctrl)## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 8 on full training set
endParallel(cl)## [1] "The model has finished."
toc()## 215.61 sec elapsed
We can improve a lot the accuracy of this random forest by increasing the number of cross validations, cv, and the number of minimum trees on each forest, that is the mtry parameter. This is probably with gradient boosting one of the tools that is more time consuming. Despite this I have tried to improve it a bit and it is possible thanks to parallel processing, that saves much time.
Variable importance:
rf_imp <- varImp(rf.train, scale = F)
plot(rf_imp, scales = list(y = list(cex = .95)))This part does not seem, but it is probably one of the most important ones for our company. Here we can see which variables are more important for our cost on the data set. With difference, the first ones are kw_avg_avg and kw_max_avg. Both are the average value of shares and the maximum value of shares that a post receives and obviously, this ones are so important due to the fact, that depending on the number of mean shares that there are, the more or less shares our post will take in that website.
Prediction:
rfPred = predict(rf.train, newdata=dataTest_csf_bin)
CM = confusionMatrix(table(rfPred, dataTest_csf_bin$shares))$table
cost = sum(as.vector(CM)*cost.unit)/sum(CM)
cost## [1] 254.4349
confusionMatrix(table(rfPred, dataTest_csf_bin$shares))$overall[1]## Accuracy
## 0.5881685
We are obtaining again a cost that is positive and that rounds 255 euros per post, what keeps being really good. Note that the accuracy is 60%, not as good as naive bayes, but pretty decent.
xgb_grid = expand.grid(
nrounds = c(500,1000),
eta = c(0.01, 0.001), # c(0.01,0.05,0.1)
max_depth = c(2, 4, 6),
gamma = 1,
colsample_bytree = c(0.2, 0.4),
min_child_weight = c(1,5),
subsample = 1
)Then, train
tic()
cl = startParallel(); cl## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
ctrl <- trainControl(method = "cv", number = 2,
classProbs = TRUE,
summaryFunction = EconomicCost,
verboseIter=T)
xgb.train = train(shares ~ ., data=dataTrain_csf_bin,
trControl = ctrl,
metric="EconomicCost",
maximize = F,
tuneGrid = xgb_grid,
preProcess = c("center", "scale"),
method = "xgbTree"
)## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 1000, max_depth = 6, eta = 0.01, gamma = 1, colsample_bytree = 0.2, min_child_weight = 5, subsample = 1 on full training set
endParallel(cl)## [1] "The model has finished."
toc()## 195.15 sec elapsed
With gradient boosting, the most time consumer model that we know up to now, and with the few parameters that we have tried, I have obtained the following fit values: Fitting nrounds = 500, max_depth = 6, eta = 0.01, gamma = 1, colsample_bytree = 0.2, min_child_weight = 5, subsample = 1 on full training set
Variable importance:
xgb_imp <- varImp(xgb.train, scale = F)
plot(xgb_imp, scales = list(y = list(cex = .95)))Most important variables keep been more or less the same ones, Prediction and cost
threshold = 0.2
xgbProb = predict(xgb.train, newdata=dataTest_csf_bin, type="prob")
xgbPred = rep("FewLikes", nrow(dataTest_csf_bin))
xgbPred[which(xgbProb[,2] > threshold)] = "LotLikes"
CM = confusionMatrix(table(xgbPred, dataTest_csf_bin$shares))$table
cost = sum(as.vector(CM)*cost.unit)/sum(CM); cost## [1] 253.9102
confusionMatrix(table(xgbPred, dataTest_csf_bin$shares))$overall[1]## Accuracy
## 0.5417508
As in rf, we get 256 euros/post in Cost and 51% of accuracy, I keep insisting that if we had more resources, upgrading the tunning grid will provide us more and more accuracy, probably up to a 20% increase.
Now we are going to compute a really intelligent movement that is: instead of keeping a model that performs better, let´s try to compute or keep the 3 best ones and get their average. In practice thanks to this easy step, we get a global better accuracy because thanks to mixing, the model fits better in new data, in future data and we can get higher performances.
mode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}Prediction
ensemble.pred = apply(data.frame(dtPred, xgbPred, rfPred), 1, mode)
CM1 = confusionMatrix(factor(ensemble.pred), dataTest_csf_bin$shares)$table
cost = sum(as.vector(CM)*cost.unit)/sum(CM1); cost## [1] 253.9102
We can see that the output is 256 euros/post, that is the average of all the models.
It has been a curious thing to compute classification approach with this data set. It may seem in some cases not too relevant because this data originally was made for regression, but I think that we have been capable of getting really convenient insights and at the end I realized that the data was also cool for this type of supervised learning.
We know for which channels or days or the weeks, the number of shares in our posts will be higher, we know what variables are most important depending on the model approach that we use and a really cool thing is that since an economic point of view, we have been capable of guessing how will work this data if we were a company that wants to public something in social media or any other channel, advertise it and earn money with it.
Naive Bayes has been the best model with many difference but the fact that accuracy and kappa are that high does not seem to be a really good think, maybe over fitting is related to that. I have decided to use this many variables, around 40, because many models work well with it, but probably if we spend more time removing a few and configuring well the hyper parameters, our results would be better.
splitting = createDataPartition(data$shares, p = 0.9, list = FALSE) # 80% for training and 20% for testing
dataModels = data[splitting,]
dataFinalTesting = data[-splitting,] #Only for regression, as variables are not transformed.
splitting_2 = createDataPartition(dataModels$shares, p = 0.7, list = FALSE)
dataTrain = dataModels[splitting_2,]
dataTest = dataModels[-splitting_2,]ggplot(data=dataTrain) + aes(x = shares) + geom_histogram(fill = "darkgreen") + ggtitle("Total shares")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We can see that there are many asymmetric variables and we can just see very extreme values. Most of the posts receive about 1400 shares, and they oscillate between 1000 and 3000, we can see that there are also some outliers, let´s see it on a boxplot:
summary(dataTrain$shares)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5 946 1400 3372 2800 843300
ggplot(data=dataTrain) + aes(y = shares) + geom_boxplot(fill = "darkgreen") + ggtitle("Total shares")ggplot(data=dataTrain) + aes(y = shares) + ylim(0,10000) + geom_boxplot(fill = "darkgreen") + ggtitle("Total shares")## Warning: Removed 1359 rows containing non-finite values (stat_boxplot).
dataTrain<-dataTrain[!(dataTrain$shares >= 3500),] #removing rows with shares greater than 5000
ggplot(data=dataTrain) + aes(y = shares) + geom_boxplot(fill = "darkgreen") + ggtitle("Total shares") We have needed to implement the ylim in order to see something because there are some post with almost a million shares, that in comparison with the average and the median is a clear outlier. I am going to remove all the posts with more than 3500 shares by the moment, as many of them keep being outliers but are more or less close to the third quartile.
Let´s see a density plot now that corroborates the previous information:
ggplot(data=dataTrain) + aes(x = shares) + geom_density(fill = "darkgreen") + ggtitle("Total shares")ggplot(data=dataTrain) + aes(x = log(shares + 10^4)) + geom_density(fill = "darkgreen") + ggtitle("Total shares distribution") I have applied also the logarithm in order to see it more linearly, also adding some shares to the x axis, so we can see better the end of the curve.
Let´s analyze how the day of the week influences on the number of shares
ggplot(dataTrain, aes(fill = weekday, y = shares)) + geom_boxplot() p = ggplot(dataTrain)+aes(x=shares, fill = weekday) + geom_density(alpha = 0.5) + ggtitle("Shares depending on the day of the week"); ggplotly(p)This two plots are really useful, by clicking on each day of the week, we can see the number of shares on each day, and as expected, on the weekend the number of shares is higher, this is due to the over activity during the days of free time, where people is staying with the family or enjoying their free their time, which means checking on the media any posts or comments.
Now let´s focus on the kind of articles that receive more likes/shares:
ggplot(dataTrain, aes(fill = channel, y = shares)) + geom_boxplot() p = ggplot(dataTrain)+aes(x=shares, fill = channel)+geom_density(alpha = 0.5) + ggtitle("Shares depending on the channel"); ggplotly(p)World theme, follow by Entertainment are the least shared channels, while Social-Media and Others (any subcathegories such as Lifestyle) are the highest shared ones, again not by surprise. This means that the most shared posts are the ones people share in their profiles, instead the ones that are related to the news, the real world and people´s entertainment, that have a higher density but a quite lower number of shares.
Let´s see if can relate or see something in relation with numerical variables:
p = ggplot(dataTrain) + aes(x = average_token_length, y = self_reference_avg_sharess, color = channel)+
geom_point(size = 2); ggplotly(p)p = ggplot(dataTrain) + aes(x = average_token_length, y = self_reference_avg_sharess, color = weekday)+
geom_point(size = 2); ggplotly(p)As before, we can see that if we click and separate in the plot depending on weekday or channel, social media and other, have the highest average number of shares. There is not a huge difference, but if we check carefully it can be appreciated.
First, let´s check which are the variables that present a higher correlation with the target variable, “shares”:
dataTrain$channel = as.numeric(as.factor(dataTrain$channel))
dataTrain$weekday = as.numeric(as.factor(dataTrain$weekday))
dataTest$channel = as.numeric(as.factor(dataTest$channel))
dataTest$weekday = as.numeric(as.factor(dataTest$weekday))
corr_delay = sort(cor(dataTrain[,c(1:47)])["shares",], decreasing = T) # I am skipping the last 3 categorical values.
corr = data.frame(corr_delay)
p = ggplot(corr,aes(x = row.names(corr), y = corr_delay)) + geom_bar(stat = "identity", fill = "darkgreen") +
scale_x_discrete(limits= row.names(corr)) + labs(x = "", y = "Shares", title = "Correlations between the variables and Shares obtained") +
theme(plot.title = element_text(hjust = 0), axis.text.x = element_text(angle = 45, hjust = 1)); ggplotly(p)We are including also the variable shares, whose correlation to itself is obviously 1, so that if we don´t put it and we scale, visually we are going to think that kw_avg_avg variable has a really high correlation when it is just around 0.08. From here, we can also see that the variable LDA_02 is negatively correlated to Shares in a High value.
“kw_avg_avg” and “is_weekend” seem to be the most correlated variables.
Nevertheless, let´s make some zoom in and check better the correlations:
corr_shares <- sort(cor(dataTrain[,c(21, 29, 24, 22, 23,20, 40 ,28, 48, 49, 47)])["shares",], decreasing = T)
corr=data.frame(corr_shares)
p=ggplot(corr,aes(x = row.names(corr), y = corr_shares)) +
geom_bar(stat = "identity", fill = "darkgreen") +
scale_x_discrete(limits= row.names(corr)) +
labs(x = "Predictors", y = "Shares", title = "Correlations of the Variables with Shares") +
theme(plot.title = element_text(hjust = 0, size = rel(1.5)),
axis.text.x = element_text(angle = 45, hjust = 1)); ggplotly(p)When can see, that in this data set, correlation between variables is quite high, we saw it at the beginning in the correlations matrix, so this is not a surprise. There will be almost no collinearity, but it does not mean that we do not have noise because having many variables as we do can end up with overfiting, for some models that is good. (for the ones where we only care about predicting with good R^2).
First of all, let´s compute the easiest model we could, simple regression:
linFit <- lm((shares) ~ kw_avg_avg, data=dataTrain); summary(linFit)##
## Call:
## lm(formula = (shares) ~ kw_avg_avg, data = dataTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3165.7 -520.7 -188.3 383.2 2120.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.173e+03 1.357e+01 86.40 <2e-16 ***
## kw_avg_avg 7.598e-02 4.190e-03 18.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 708.2 on 20066 degrees of freedom
## Multiple R-squared: 0.01612, Adjusted R-squared: 0.01607
## F-statistic: 328.8 on 1 and 20066 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(linFit, pch=23 ,bg='orange',cex=2)kw_avg_avg: Are the keyword average shares.
We can see that R2 is 2%, what is really really bad. This is as expected, because as we use a simple linear model, the R2, that is the percentage of variability explained, that allow us to measure the accuracy or quality of a model, is going to be the even lower than the correlation between both variables. We need to include more parameters to estimate our results.
We can say that the relation between shares and kw_avg_avg (average shares on keyword) is non-linear, so we need to use a non-linear tool, because we can see that for linear ones, the performance is bad. Insights?
As in classification, let´s apply regression but introducing the logarithm:
linFit <- lm(log(shares) ~kw_avg_avg, data=dataTrain); summary(linFit) ##
## Call:
## lm(formula = log(shares) ~ kw_avg_avg, data = dataTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4527 -0.3390 -0.0149 0.3703 1.1313
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.940e+00 1.009e-02 687.82 <2e-16 ***
## kw_avg_avg 5.822e-05 3.115e-06 18.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5264 on 20066 degrees of freedom
## Multiple R-squared: 0.01712, Adjusted R-squared: 0.01707
## F-statistic: 349.5 on 1 and 20066 DF, p-value: < 2.2e-16
We can see that R2 keeps being the same one, this means that in order to predict the number of shares in the media, as they depend on many variables, we will need to use really advance tools.
linFit1 <- lm(log(shares) ~ kw_avg_avg + self_reference_avg_sharess + (self_reference_max_shares) + LDA_02 + LDA_03*self_reference_min_shares, data=dataTrain) ; summary(linFit1)##
## Call:
## lm(formula = log(shares) ~ kw_avg_avg + self_reference_avg_sharess +
## (self_reference_max_shares) + LDA_02 + LDA_03 * self_reference_min_shares,
## data = dataTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5150 -0.3328 -0.0134 0.3614 1.2431
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.032e+00 1.159e-02 606.735 <2e-16 ***
## kw_avg_avg 5.726e-05 3.523e-06 16.255 <2e-16 ***
## self_reference_avg_sharess -1.039e-06 8.633e-07 -1.204 0.2287
## self_reference_max_shares 6.888e-07 3.466e-07 1.987 0.0469 *
## LDA_02 -2.508e-01 1.341e-02 -18.708 <2e-16 ***
## LDA_03 -1.650e-01 1.523e-02 -10.831 <2e-16 ***
## self_reference_min_shares 1.175e-06 6.211e-07 1.892 0.0586 .
## LDA_03:self_reference_min_shares -7.287e-07 1.060e-06 -0.687 0.4918
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.521 on 20060 degrees of freedom
## Multiple R-squared: 0.03732, Adjusted R-squared: 0.03699
## F-statistic: 111.1 on 7 and 20060 DF, p-value: < 2.2e-16
linFit2 <- lm(log(shares) ~ kw_avg_avg + self_reference_avg_sharess + (self_reference_max_shares) + LDA_00 +
LDA_01 + LDA_02 +timedelta + n_unique_tokens + num_hrefs + channel + weekday + num_videos + abs_title_sentiment_polarity + title_sentiment_polarity + title_subjectivity + n_tokens_title +LDA_04 + LDA_03*self_reference_min_shares, data=dataTrain) ; summary(linFit2)##
## Call:
## lm(formula = log(shares) ~ kw_avg_avg + self_reference_avg_sharess +
## (self_reference_max_shares) + LDA_00 + LDA_01 + LDA_02 +
## timedelta + n_unique_tokens + num_hrefs + channel + weekday +
## num_videos + abs_title_sentiment_polarity + title_sentiment_polarity +
## title_subjectivity + n_tokens_title + LDA_04 + LDA_03 * self_reference_min_shares,
## data = dataTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3535 -0.3198 -0.0110 0.3493 1.3166
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.900e+00 3.536e-02 195.133 < 2e-16 ***
## kw_avg_avg 5.514e-05 3.547e-06 15.548 < 2e-16 ***
## self_reference_avg_sharess 5.070e-07 8.510e-07 0.596 0.551351
## self_reference_max_shares -5.912e-08 3.425e-07 -0.173 0.862964
## LDA_00 3.031e-01 1.959e-02 15.472 < 2e-16 ***
## LDA_01 -4.001e-02 2.108e-02 -1.898 0.057689 .
## LDA_02 -2.354e-01 2.176e-02 -10.822 < 2e-16 ***
## timedelta 3.542e-05 1.814e-05 1.953 0.050818 .
## n_unique_tokens -2.148e-01 2.808e-02 -7.650 2.1e-14 ***
## num_hrefs 3.752e-03 3.499e-04 10.725 < 2e-16 ***
## channel 3.182e-02 2.678e-03 11.884 < 2e-16 ***
## weekday -1.466e-02 1.680e-03 -8.727 < 2e-16 ***
## num_videos 3.766e-04 9.518e-04 0.396 0.692324
## abs_title_sentiment_polarity -1.865e-02 2.525e-02 -0.739 0.460060
## title_sentiment_polarity 5.673e-02 1.539e-02 3.685 0.000229 ***
## title_subjectivity 4.505e-02 1.640e-02 2.747 0.006028 **
## n_tokens_title -1.266e-03 1.785e-03 -0.709 0.478364
## LDA_04 1.846e-01 1.892e-02 9.759 < 2e-16 ***
## LDA_03 NA NA NA NA
## self_reference_min_shares 3.296e-07 6.101e-07 0.540 0.589055
## LDA_03:self_reference_min_shares -6.107e-07 1.042e-06 -0.586 0.557927
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5103 on 20048 degrees of freedom
## Multiple R-squared: 0.07706, Adjusted R-squared: 0.07618
## F-statistic: 88.09 on 19 and 20048 DF, p-value: < 2.2e-16
#With the star symbol, R is considering 3 columns, and using them.
pr.multiple = exp(predict(linFit2, newdata=dataTest))## Warning in predict.lm(linFit2, newdata = dataTest): prediction from a rank-
## deficient fit may be misleading
cor(dataTest$shares, pr.multiple)^2## [1] 0.008220323
R2 is now 7.3%, what means that we have achieved to triple our prediction accuracy, our R^2, the problem is that it keeps been so low despite we have included lot of variables. We need to increase the models complexity.
I predicted just to consolidate the results, but this accuracy would be useless in any company.
#model = log(shares) ~ kw_avg_avg + self_reference_avg_sharess + (self_reference_max_shares) + LDA_02 + LDA_03*self_reference_min_shares + I(LDA_03^2) + I(self_reference_min_shares^2)
#model = (log(shares) ~ kw_avg_avg + self_reference_avg_sharess + (self_reference_max_shares) + LDA_00 +
# LDA_01 + LDA_02 +timedelta + n_unique_tokens + num_hrefs + channel + weekday + num_videos + abs_title_sentiment_polarity + I(title_sentiment_polarity^2) + title_subjectivity + n_tokens_title +LDA_04 + LDA_03*self_reference_min_shares + I(LDA_03^2) + I(self_reference_min_shares^2))
#
#
# model = (shares~ . - abs_title_sentiment_polarity)
model = (log(shares) ~ kw_avg_avg + self_reference_avg_sharess + (self_reference_max_shares) + LDA_00 +
LDA_01 + LDA_02+ LDA_04 +timedelta + n_unique_tokens + num_hrefs + channel + weekday + num_videos + abs_title_sentiment_polarity + I(title_sentiment_polarity^2) + title_subjectivity + n_tokens_title +LDA_04 + LDA_03*self_reference_min_shares + I(LDA_03^2) + I(self_reference_min_shares^2) + I(kw_avg_avg^2)+ I(self_reference_avg_sharess^2)+ I(LDA_00^2)+ I(LDA_01^2)+ I(LDA_02^2)+ I(LDA_04^2)+ I(timedelta^2)+ I(n_unique_tokens^2)+ I(num_hrefs^2)+ I(channel^2)+ I(weekday^2)+ I(num_videos^2) + I(abs_title_sentiment_polarity^2) )
#This last one is the model that has given me a higher performance
linFit <- lm(model, data=dataTrain); summary(linFit)##
## Call:
## lm(formula = model, data = dataTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4323 -0.3122 -0.0119 0.3408 1.4117
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.484e+00 7.323e-02 88.546 < 2e-16 ***
## kw_avg_avg 1.082e-04 6.342e-06 17.062 < 2e-16 ***
## self_reference_avg_sharess 4.332e-06 1.285e-06 3.372 0.000746 ***
## self_reference_max_shares -4.522e-07 3.618e-07 -1.250 0.211419
## LDA_00 3.164e-01 8.509e-02 3.719 0.000201 ***
## LDA_01 -1.502e-01 9.353e-02 -1.606 0.108353
## LDA_02 3.061e-02 8.347e-02 0.367 0.713801
## LDA_04 1.389e-01 8.052e-02 1.726 0.084451 .
## timedelta -3.862e-04 7.082e-05 -5.453 5.01e-08 ***
## n_unique_tokens 1.623e-01 9.453e-02 1.717 0.085912 .
## num_hrefs 3.221e-03 6.574e-04 4.899 9.71e-07 ***
## channel 1.484e-01 1.309e-02 11.337 < 2e-16 ***
## weekday 5.001e-02 8.752e-03 5.714 1.12e-08 ***
## num_videos 3.145e-03 1.687e-03 1.864 0.062396 .
## abs_title_sentiment_polarity -8.432e-02 5.398e-02 -1.562 0.118310
## I(title_sentiment_polarity^2) 1.123e-01 6.043e-02 1.858 0.063235 .
## title_subjectivity 5.048e-02 1.771e-02 2.851 0.004368 **
## n_tokens_title 5.509e-04 1.781e-03 0.309 0.757154
## LDA_03 NA NA NA NA
## self_reference_min_shares 5.177e-07 1.153e-06 0.449 0.653335
## I(LDA_03^2) -8.733e-02 6.871e-02 -1.271 0.203726
## I(self_reference_min_shares^2) 2.595e-12 2.528e-12 1.026 0.304823
## I(kw_avg_avg^2) -4.231e-09 3.802e-10 -11.128 < 2e-16 ***
## I(self_reference_avg_sharess^2) -8.904e-12 2.398e-12 -3.713 0.000206 ***
## I(LDA_00^2) 5.731e-02 7.059e-02 0.812 0.416887
## I(LDA_01^2) 1.047e-01 7.697e-02 1.361 0.173578
## I(LDA_02^2) -1.733e-01 6.454e-02 -2.685 0.007258 **
## I(LDA_04^2) 5.080e-02 6.373e-02 0.797 0.425330
## I(timedelta^2) 6.114e-07 9.409e-08 6.497 8.37e-11 ***
## I(n_unique_tokens^2) -4.428e-01 1.046e-01 -4.234 2.31e-05 ***
## I(num_hrefs^2) -7.851e-06 8.136e-06 -0.965 0.334549
## I(channel^2) -1.526e-02 1.661e-03 -9.190 < 2e-16 ***
## I(weekday^2) -8.112e-03 1.074e-03 -7.556 4.32e-14 ***
## I(num_videos^2) -9.283e-05 4.319e-05 -2.149 0.031616 *
## I(abs_title_sentiment_polarity^2) NA NA NA NA
## LDA_03:self_reference_min_shares -3.735e-06 1.129e-06 -3.307 0.000944 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5052 on 20034 degrees of freedom
## Multiple R-squared: 0.09619, Adjusted R-squared: 0.0947
## F-statistic: 64.61 on 33 and 20034 DF, p-value: < 2.2e-16
We have squared two non-linear variables to check for improvements. As we are introducing tones of variables, we are going to have many betas and probably the noise is going to be also quite high, we are going to be predicting basically noise, but this is actually the best way to get a greater R^2 in this set. Seems reasonable. R2 in testing is similar to that in training
The benchmark is our reference point and for us, it is going to be the prediction using the mean number of shares. Our models so demonstrate an accuray higher than the one in here:
mean(dataTrain$shares)## [1] 1401.567
benchFit <- lm(shares ~ 1, data=dataTrain)
predictions <- predict(benchFit, newdata=dataTest)
cor(dataTest$shares, predictions)^2## Warning in cor(dataTest$shares, predictions): the standard deviation is zero
## [1] NA
RMSE = sqrt(mean((predictions - dataTest$shares)^2))
RMSE## [1] 8966.834
We obtain that the root mean square error; ie, RMSE, is 11288, so all models should be better than this one, otherwise they will not be useful.
#Repeated Cross Validation with Caret
We are using now caret to try better outputs using cv:
ctrl <- trainControl(method = "repeatedcv",
number = 5, repeats = 5, allowParallel = TRUE)
test_results <- data.frame(shares = log(dataTest$shares))cl = startParallel();cl## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
lm_tune <- train(model, data = dataTrain,
method = "lm",
preProc=c('scale', 'center'),
trControl = ctrl); lm_tune## Linear Regression
##
## 20068 samples
## 19 predictor
##
## Pre-processing: scaled (35), centered (35)
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 16055, 16054, 16054, 16056, 16053, 16055, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.5059477 0.09217688 0.3904224
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
endParallel(cl)## [1] "The model has finished."
By the moment, I am using cv, but as there is no hyper parameter yet, we do not really use it, despite this, we have increase the R2 up to 10%, not bad at all.
As we have many variables, this model should demonstrate some better performance:
tic()
cl = startParallel(); cl## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
alm_tune <- train(model, data = dataTrain,
method = "lm",
preProc=c('scale', 'center'),
trControl = ctrl)
endParallel(cl)## [1] "The model has finished."
toc()## 14.36 sec elapsed
test_results$alm <- predict(alm_tune, dataTest)## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
postResample(pred = test_results$alm, obs = test_results$shares)## RMSE Rsquared MAE
## 0.97431799 0.07678933 0.66412598
The best model until now, had as much as 8% R^2 in the testing set and now we are getting 8.6% in the R2 despite it is so overfitted. This model will only work for this dataset, as in most of the cases. Overfitting is very dangerous, all Betas are very noisy but to compute predictions the model is not dangerous, to predict, the use of overfitting is not bad. You cannot understand why it is working.
Visualization
qplot(test_results$alm, test_results$shares) +
labs(title="Linear Regression Observed VS Predicted", x ="Predicted", y="Observed") +
geom_abline(intercept = 0, slope = 1, colour = "green") +
theme_bw()We can see that as we knew, our model is definitely not linear. The variability is smaller and points are closer to the line. You can use this graph in any data set in the world.
tic()
cl = startParallel(); cl## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
for_tune = train(model, data = dataTrain,
method = "leapForward",
preProc=c('scale', 'center'),
tuneGrid = expand.grid(nvmax = 2:30),
trControl = ctrl)## Warning in leaps.setup(x, y, wt = weights, nbest = nbest, nvmax = nvmax, : 2
## linear dependencies found
## Reordering variables and trying again:
endParallel(cl)## [1] "The model has finished."
toc()## 14.92 sec elapsed
for_tune## Linear Regression with Forward Selection
##
## 20068 samples
## 19 predictor
##
## Pre-processing: scaled (35), centered (35)
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 16055, 16055, 16053, 16055, 16054, 16056, ...
## Resampling results across tuning parameters:
##
## nvmax RMSE Rsquared MAE
## 2 0.5262967 0.01733318 0.4112015
## 3 0.5215030 0.03559206 0.4068019
## 4 0.5207392 0.03839544 0.4060967
## 5 0.5200965 0.04077445 0.4054602
## 6 0.5188108 0.04543154 0.4041540
## 7 0.5185800 0.04627704 0.4039840
## 8 0.5182917 0.04732302 0.4037631
## 9 0.5173418 0.05075231 0.4025720
## 10 0.5169039 0.05234667 0.4020954
## 11 0.5158906 0.05607740 0.4010781
## 12 0.5154417 0.05767342 0.4005640
## 13 0.5149989 0.05929044 0.4001299
## 14 0.5144764 0.06121718 0.3992399
## 15 0.5140504 0.06282255 0.3989035
## 16 0.5124246 0.06878128 0.3972000
## 17 0.5115730 0.07186506 0.3963575
## 18 0.5101998 0.07680829 0.3952209
## 19 0.5093830 0.07970564 0.3944875
## 20 0.5090916 0.08074721 0.3942032
## 21 0.5085155 0.08280889 0.3936421
## 22 0.5080750 0.08441037 0.3931503
## 23 0.5078089 0.08535071 0.3928102
## 24 0.5076237 0.08601358 0.3925764
## 25 0.5075106 0.08642649 0.3924365
## 26 0.5073140 0.08714035 0.3922425
## 27 0.5070976 0.08790967 0.3919777
## 28 0.5069159 0.08855719 0.3917537
## 29 0.5067626 0.08913374 0.3915465
## 30 0.5062447 0.09096086 0.3909802
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was nvmax = 30.
plot(for_tune)Consider that nvmax is the max number of variables you need to take, this is the hyper parameter, and we can see that in our case, the larger it is, the better.
#R is concluding that nvmax optimal is 30 . In the plot we can see what happens with other values by comparin with the error. This is expensive because uses cv and each time one block of train is removed and then it tests on the train. The RMSE error keeps been high, but we almost achieve 10% of variability explained.
Now we are going to focus on knowing and understanding which Betas are better, this is part of the estimators interpretation:
coef(for_tune$finalModel, for_tune$bestTune$nvmax)## (Intercept) kw_avg_avg
## 7.114920882 0.118728510
## self_reference_avg_sharess self_reference_max_shares
## 0.079700798 -0.014264076
## LDA_00 LDA_01
## 0.106874679 -0.016789880
## LDA_02 timedelta
## 0.039128737 0.012109364
## n_unique_tokens num_hrefs
## 0.020439316 0.030841639
## channel weekday
## 0.333550177 0.107389863
## num_videos abs_title_sentiment_polarity
## 0.010083346 -0.018761285
## I(title_sentiment_polarity^2) title_subjectivity
## 0.018295695 0.016243771
## self_reference_min_shares I(self_reference_min_shares^2)
## 0.015154254 0.014217734
## I(kw_avg_avg^2) I(self_reference_avg_sharess^2)
## -0.068449280 -0.077485594
## I(LDA_00^2) I(LDA_01^2)
## 0.001041659 0.013611182
## I(LDA_02^2) I(LDA_04^2)
## -0.061533372 0.063818694
## I(n_unique_tokens^2) I(num_hrefs^2)
## -0.053994478 -0.004032317
## I(channel^2) I(weekday^2)
## -0.280251579 -0.141886490
## I(num_videos^2) LDA_03:self_reference_min_shares
## -0.012366897 -0.017832753
## I(abs_title_sentiment_polarity^2)
## 0.000000000
Betas for 30 variables
Predict
test_results$frw <- predict(for_tune, dataTest)
postResample(pred = test_results$frw, obs = test_results$shares)## RMSE Rsquared MAE
## 0.97423601 0.07725199 0.66432016
qplot(test_results$frw, test_results$shares) +
labs(title="Forward Regression Observed VS Predicted", x="Predicted", y="Observed") +
geom_abline(intercept = 0, slope = 1, colour = "green") +
theme_bw()As our estimation keeps been so low, we are not getting most of the points closer to this line, we will have more success on machine learning when we implement totally non-linear models.
Train
tic()
cl = startParallel(); cl## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
back_tune <- train(model, data = dataTrain,
method = "leapBackward",
preProc=c('scale', 'center'),
tuneGrid = expand.grid(nvmax = 2:31),
trControl = ctrl)## Warning in leaps.setup(x, y, wt = weights, nbest = nbest, nvmax = nvmax, : 2
## linear dependencies found
## Reordering variables and trying again:
back_tune## Linear Regression with Backwards Selection
##
## 20068 samples
## 19 predictor
##
## Pre-processing: scaled (35), centered (35)
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 16055, 16054, 16054, 16055, 16054, 16055, ...
## Resampling results across tuning parameters:
##
## nvmax RMSE Rsquared MAE
## 2 0.5275290 0.01292352 0.4130581
## 3 0.5244697 0.02446124 0.4098899
## 4 0.5212534 0.03615477 0.4060028
## 5 0.5156931 0.05677509 0.4005480
## 6 0.5154281 0.05775769 0.4002703
## 7 0.5147954 0.06003049 0.3996366
## 8 0.5144708 0.06123314 0.3993029
## 9 0.5138587 0.06349778 0.3987217
## 10 0.5136689 0.06420252 0.3984384
## 11 0.5132671 0.06570189 0.3981862
## 12 0.5129813 0.06672869 0.3978710
## 13 0.5124145 0.06884447 0.3972017
## 14 0.5122554 0.06943411 0.3968911
## 15 0.5106277 0.07530200 0.3954261
## 16 0.5098207 0.07821403 0.3948219
## 17 0.5096750 0.07876082 0.3946580
## 18 0.5094384 0.07956856 0.3945113
## 19 0.5093167 0.08002506 0.3943264
## 20 0.5091867 0.08046566 0.3942253
## 21 0.5087228 0.08208804 0.3937744
## 22 0.5086971 0.08220189 0.3937359
## 23 0.5083845 0.08328864 0.3933955
## 24 0.5081981 0.08396323 0.3932225
## 25 0.5080261 0.08456725 0.3930284
## 26 0.5078230 0.08530435 0.3928369
## 27 0.5076350 0.08598905 0.3926209
## 28 0.5071824 0.08767480 0.3920996
## 29 0.5068364 0.08892162 0.3916231
## 30 0.5065445 0.08995907 0.3912851
## 31 0.5060923 0.09162725 0.3906994
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was nvmax = 31.
plot(back_tune)endParallel(cl); toc()## [1] "The model has finished."
## 12.09 sec elapsed
As before, now we compute which are the chosen variables, this time we are going backwards, but the results are the same ones.
coef(back_tune$finalModel, back_tune$bestTune$nvmax)## (Intercept) kw_avg_avg
## 7.114920882 0.128585647
## self_reference_avg_sharess self_reference_max_shares
## 0.099794266 -0.020286416
## LDA_00 LDA_01
## 0.078606761 -0.037637327
## LDA_04 timedelta
## 0.036390136 -0.083690516
## n_unique_tokens num_hrefs
## 0.021708235 0.034101126
## channel weekday
## 0.334213224 0.107371741
## num_videos abs_title_sentiment_polarity
## 0.012414035 -0.018257455
## I(title_sentiment_polarity^2) title_subjectivity
## 0.018013943 0.016098700
## I(LDA_03^2) I(self_reference_min_shares^2)
## -0.025016886 0.031459974
## I(kw_avg_avg^2) I(self_reference_avg_sharess^2)
## -0.073769276 -0.094588139
## I(LDA_00^2) I(LDA_01^2)
## 0.011991181 0.018771902
## I(LDA_02^2) I(LDA_04^2)
## -0.038724766 0.011180526
## I(timedelta^2) I(n_unique_tokens^2)
## 0.099622595 -0.054584948
## I(num_hrefs^2) I(channel^2)
## -0.006219745 -0.284549630
## I(weekday^2) I(num_videos^2)
## -0.142027403 -0.013895439
## LDA_03:self_reference_min_shares I(abs_title_sentiment_polarity^2)
## -0.015911118 0.000000000
test_results$bw <- predict(back_tune, dataTest)
postResample(pred = test_results$bw, obs = test_results$shares)## RMSE Rsquared MAE
## 0.97431290 0.07689517 0.66412417
qplot(test_results$bw, test_results$shares) +
labs(title="Backward Regression Observed VS Predicted", x="Predicted", y="Observed") +
geom_abline(intercept = 0, slope = 1, colour = "blue") +
theme_bw()More or less same results, nothing so good, R2 keeps being between 8 and 10%, so not improvement.
tic(); cl = startParallel(); cl
step_tune <- train(model, data = dataTrain,
method = "leapSeq",
preProc=c('scale', 'center'),
tuneGrid = expand.grid(nvmax = 2:31),
trControl = ctrl)
endParallel(cl); toc()
plot(step_tune)
# which variables are selected?
coef(step_tune$finalModel, step_tune$bestTune$nvmax)
test_results$seq <- predict(step_tune, dataTest)
postResample(pred = test_results$seq, obs = test_results$shares)
library(elasticnet)
library(caret)
ridge_grid <- expand.grid(lambda = seq(0, .1, length = 100))
tic(); cl = startParallel(); cl
ridge_tune <- train(model, data = dataTrain,
method='ridge',
preProc=c('scale','center'),
tuneGrid = ridge_grid,
trControl=ctrl)
endParallel(cl); toc()
plot(ridge_tune)
ridge_tune$bestTune
test_results$ridge <- predict(ridge_tune, dataTest)
postResample(pred = test_results$ridge, obs = test_results$shares)
In both, step wise and ridge regression, we keep getting more or less the same R2, despite this time it has taken a lot more of computational power. This is the output:
> postResample(pred = test_results$ridge, obs = test_results$shares)
RMSE Rsquared MAE
0.97764666 0.08547507 0.66715508
ctrl <- trainControl(method = "repeatedcv",
number = 5, repeats = 5, allowParallel = TRUE)lasso_grid <- expand.grid(fraction = seq(.01, 1, length = 5))
tic(); cl = startParallel(); cl## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
lasso_tune <- train(model, data = dataTrain,
method='lasso',
preProc=c('scale','center'),
tuneGrid = lasso_grid,
trControl=ctrl)
endParallel(cl); toc()## [1] "The model has finished."
## 15.04 sec elapsed
plot(lasso_tune)lasso_tune$bestTune## fraction
## 5 1
test_results$lasso <- predict(lasso_tune, dataTest)
postResample(pred = test_results$lasso, obs = test_results$shares)## RMSE Rsquared MAE
## 0.97431799 0.07678933 0.66412598
We can see Lasso is the same except the penalty that is that ridge uses the square and with this model we remove the square is like the absolute value. Best lambda here will be close again to 0, different shape but same R2. It is very difficult to improve results using linear formulas.
This is the last model I am going to try, because as our linear model is quite bad, we are not gonna make many improvements.
modelLookup('glmnet')## model parameter label forReg forClass probModel
## 1 glmnet alpha Mixing Percentage TRUE TRUE TRUE
## 2 glmnet lambda Regularization Parameter TRUE TRUE TRUE
tic(); cl = startParallel(); cl## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
elastic_grid = expand.grid(alpha = seq(0, .2, 0.01), lambda = seq(0, .1, 0.01))
glmnet_tune <- train(model, data = dataTrain,
method='glmnet',
preProc=c('scale','center'),
tuneGrid = elastic_grid,
trControl=ctrl)
endParallel(cl); toc()## [1] "The model has finished."
## 52.24 sec elapsed
plot(glmnet_tune)glmnet_tune$bestTune## alpha lambda
## 144 0.13 0
test_results$glmnet <- predict(glmnet_tune, dataTest)
postResample(pred = test_results$glmnet, obs = test_results$shares)## RMSE Rsquared MAE
## 0.97441221 0.07672888 0.66424899
Now, This grid is in 2D, one parameter for regression and other for lassu. It is more expensive, takes a bit more but is fast yet. We havent improve the results from the others models yet. Same R2
We apply this tools exaclty equall than in classification. Knn is not very good but in this case for houses it is going to perform really good.
modelLookup('kknn')## model parameter label forReg forClass probModel
## 1 kknn kmax Max. #Neighbors TRUE TRUE TRUE
## 2 kknn distance Distance TRUE TRUE TRUE
## 3 kknn kernel Kernel TRUE TRUE TRUE
tic(); cl = startParallel(); cl## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
knn_tune <- train(model,
data = dataTrain,
method = "kknn",
preProc=c('scale','center'),
tuneGrid = data.frame(kmax=c(11,13,15,19,21),distance=2,kernel='optimal'),
trControl = ctrl)
endParallel(cl); toc()## [1] "The model has finished."
## 374.99 sec elapsed
plot(knn_tune)test_results$knn <- predict(knn_tune, dataTest)
postResample(pred = test_results$knn, obs = test_results$shares)## RMSE Rsquared MAE
## 0.98677646 0.05842808 0.67342264
#As each post does not belong to any other one, we are not getting any good results. The R2 has decrease to 6%, so no improvements have been done.
tic(); cl = startParallel(); cl## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
rf_tune <- train(model,
data = dataTrain,
method = "rf",
preProc=c('scale','center'),
trControl = ctrl,
ntree = 100,
tuneGrid = data.frame(mtry=c(1,3,5,7)),
importance = TRUE)
endParallel(cl); toc() ## [1] "The model has finished."
## 767.92 sec elapsed
plot(rf_tune)test_results$rf <- predict(rf_tune, dataTest)
postResample(pred = test_results$rf, obs = test_results$shares)## RMSE Rsquared MAE
## 0.9579239 0.1164964 0.6473019
For the first time, I am glad about getting R2 of 11%, this is because random forests use very complex trees.
Variable importance
plot(varImp(rf_tune, scale = F), scales = list(y = list(cex = .95))) AS before, LDA_03 is the most important variable, follow closely by time delta, LDA_03 with no square and num_hrefs. That are the number of reference links that a post has and time delta, that is the time between the publication of the post and the data acquisition, so, the highest times, pases, the number of shares will be also higher.
tic(); cl = startParallel(); cl
xgb_tune <- train(model,
data = dataTrain,
method = "xgbTree",
preProc=c('scale','center'),
objective="reg:squarederror",
trControl = ctrl,
tuneGrid = expand.grid(nrounds = c(100,200), max_depth = c(1,2,3), eta = c(0.01, 0.1, 1),
gamma = c(1, 2, 3) subsample = c(0.1,0.2,0.3)))
tic(); cl = startParallel(); cl
test_results$xgb <- predict(xgb_tune, dataTest)
postResample(pred = test_results$xgb, obs = test_results$shares)
The only difficulty is that this is so expensive, need to select very well the grid, in each data set, the numbers changes completely, they depend heavily of each. Is a very good tool but is very expensive for the computer.
For my case is around 10% R2 that is not better than forest but if we use 2 days of gradient boosting it will give us a higher %.
As, before, we are going to use ensemble so that we select the best model approach and get a higher accuracy. I must admit that I am quite frustrated because of the fact that I have not found a good model for the regression approach. Classification was okay, but for this set, as the most correlated variable is just about 8%, it is hard to be capable of predicting well.
Let’s summarize the MAE for all the tools
apply(test_results[-1], 2, function(x) mean(abs(x - test_results$shares)))## alm frw bw lasso glmnet knn rf
## 0.6641260 0.6643202 0.6641242 0.6641260 0.6642490 0.6734226 0.6473019
test_results$comb = (test_results$alm + test_results$knn + test_results$rf)/3
postResample(pred = test_results$comb, obs = test_results$shares)## RMSE Rsquared MAE
## 0.9673855 0.1095884 0.6547216
RMSE Rsquared MAE
0.9716005 0.1115928 0.6565718
We get a good R2 for what we have seen until now, that is up to 11%
It is important the predictions but need to know the prediction intervals to detect opportunities and be capable of advertising more or less a post in a determined place and moment that could get more shares in a cheap / expensive way. If we use a linear model, the interval is given to 95%CI automatically but in machine learning there are no formulas, so we apply common sense:
yhat = exp(test_results$comb)
head(yhat) ## [1] 1008.2539 1103.0055 1251.3116 1249.3492 938.6784 1260.0269
hist(yhat, col="darkgreen") This are the residuals of the test, let´s define the prediction intervals:
The errors are more symmetric
y = exp(test_results$shares)
error = y-yhat
hist(error, col="darkgreen")Let’s use the first 200 posts in testing to compute the noise size
noise = error[1:200]Prediction intervals: let’s fix a 90% confidence
lwr = yhat[201:length(yhat)] + quantile(noise,0.05, na.rm=T) #5%, lower part of the error.
upr = yhat[201:length(yhat)] + quantile(noise,0.95, na.rm=T)Then we need to sum the prediction. It is yhat +- the quantiles. In the 5% we sum but it is like the minus because the number is going to be negative. We use a part of the testing set to compute the quantiles and then with the intervals we use the second part, this is the final testing set.
Performance using the last prices in yhat:
predictions = data.frame(real=y[201:length(y)], fit=yhat[201:length(yhat)], lwr=lwr, upr=upr)
predictions = predictions %>% mutate(out=factor(if_else(real<lwr | real>upr,1,0)))
mean(predictions$out==1)## [1] 0.1492907
ggplot(predictions, aes(x=fit, y=real))+
geom_point(aes(color=out)) + theme(legend.position="none") + ylim(0, 40000)+
geom_ribbon(data=predictions,aes(ymin=lwr,ymax=upr),alpha=0.3) +
labs(title = "Prediction intervals", x = "prediction",y="Real Number Of Shares")## Warning: Removed 84 rows containing missing values (geom_point).
In red post that are inside the interval, and outside there are some posts, they are called opportunities. For posts above you need to advertise them because they will give you a lot of publicity and shares and for posts below you should not invest advertising a lot in them since they will not get much likes. You can increase or decrease the width by changing the %, 90% uses to be good, it depends on the company. This is the only way to compute CI using machine learning.
So, as we have been saying, the variability of the models explained in regression have been pretty low.
I have not been capable of getting more than 11% of accuracy, what seems to be a disaster. It is true that for some datasets the accuracy cannot be computed easily, I took this data from the page that I posted because classfication and regression could be performed. I think that probably there is a not so difficult way to find a model that fits properly in this data set.
By other side, as the correlation between variables is so poor, up to 8% the most, I think that is the reason why is that hard to predict something here. I have tried to check reducing the number of variables but the correlation obviously is the same and the results are the same ones too, so I do not know exactly what more to do.
Moreover, as time is limited, I have used parallel processing to compute and run the script but despite it, I have not used pretty much large grids, If I did, i think i would get a better correlation but not so high as in classification.
In binary classifications (conclusions about that are above just before regression), I have been capable of achieving a great result as it is so easy to predict if the post will have or not a number of shares greater than a threshold and classify it depending on that.
I hope you enjoyed the work and please, if there is a way to improve this poor regression accuracy by changing the model, I will like to know it!